From 6c813c3fe9e30fcf3c4d94d2ba24108babd745b0 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 28 May 2006 22:50:18 -0700 Subject: [NETFILTER]: Fix small information leak in SO_ORIGINAL_DST (CVE-2006-1343) It appears that sockaddr_in.sin_zero is not zeroed during getsockopt(...SO_ORIGINAL_DST...) operation. This can lead to an information leak (CVE-2006-1343). Signed-off-by: Marcel Holtmann Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 1 + net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 979a2eac6f00..a297da7bbef5 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1318,6 +1318,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5bc9f64d7b5b..77d974443c7b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -348,6 +348,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); -- cgit v1.2.3 From ca3ba88d0cf4b5d7a628caf505c231162dde9429 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 28 May 2006 22:50:40 -0700 Subject: [NETFILTER]: mark H.323 helper experimental Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 3d560dec63ab..d4072533da21 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -170,8 +170,8 @@ config IP_NF_PPTP Documentation/modules.txt. If unsure, say `N'. config IP_NF_H323 - tristate 'H.323 protocol support' - depends on IP_NF_CONNTRACK + tristate 'H.323 protocol support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL help H.323 is a VoIP signalling protocol from ITU-T. As one of the most important VoIP protocols, it is widely used by voice hardware and -- cgit v1.2.3 From 7114b0bb6df7b2db266ba4847e4dd8333fa98a9a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 28 May 2006 22:51:05 -0700 Subject: [NETFILTER]: PPTP helper: fix sstate/cstate typo Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_helper_pptp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 7d3ba4302e9e..8ccfe17bb253 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -469,8 +469,8 @@ pptp_inbound_pkt(struct sk_buff **pskb, DEBUGP("%s but no session\n", pptp_msg_name[msg]); break; } - if (info->sstate != PPTP_CALL_IN_REP - && info->sstate != PPTP_CALL_IN_CONF) { + if (info->cstate != PPTP_CALL_IN_REP + && info->cstate != PPTP_CALL_IN_CONF) { DEBUGP("%s but never sent IN_CALL_REPLY\n", pptp_msg_name[msg]); break; -- cgit v1.2.3 From fb80a6e1a521eb298edb4365429d533dd39427fa Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 2 Jun 2006 17:51:08 -0700 Subject: [TCP] tcp_highspeed: Fix problem observed by Xiaoliang (David) Wei When snd_cwnd is smaller than 38 and the connection is in congestion avoidance phase (snd_cwnd > snd_ssthresh), the snd_cwnd seems to stop growing. The additive increase was confused because C array's are 0 based. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index b72fa55dfb84..ba7c63ca5bb1 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -135,7 +135,8 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, /* Do additive increase */ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { - tp->snd_cwnd_cnt += ca->ai; + /* cwnd = cwnd + a(w) / cwnd */ + tp->snd_cwnd_cnt += ca->ai + 1; if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd_cnt -= tp->snd_cwnd; tp->snd_cwnd++; -- cgit v1.2.3 From f291196979ca80cdef199ca2b55e2758e8c23a0d Mon Sep 17 00:00:00 2001 From: Herbert Xu ~{PmVHI~} Date: Mon, 5 Jun 2006 15:03:37 -0700 Subject: [TCP]: Avoid skb_pull if possible when trimming head Trimming the head of an skb by calling skb_pull can cause the packet to become unaligned if the length pulled is odd. Since the length is entirely arbitrary for a FIN packet carrying data, this is actually quite common. Unaligned data is not the end of the world, but we should avoid it if it's easily done. In this case it is trivial. Since we're discarding all of the head data it doesn't matter whether we move skb->data forward or back. However, it is still possible to have unaligned skb->data in general. So network drivers should be prepared to handle it instead of crashing. This patch also adds an unlikely marking on len < headlen since partial ACKs on head data are extremely rare in the wild. As the return value of __pskb_trim_head is no longer ever NULL that has been removed. Signed-off-by: Herbert Xu ~{PmV>HI~} Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 743016baa048..f33c9dddaa12 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -642,7 +642,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss * eventually). The difference is that pulled data not copied, but * immediately discarded. */ -static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) +static void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; @@ -667,7 +667,6 @@ static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) skb->tail = skb->data; skb->data_len -= len; skb->len = skb->data_len; - return skb->tail; } int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) @@ -676,12 +675,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; - if (len <= skb_headlen(skb)) { + /* If len == headlen, we avoid __skb_pull to preserve alignment. */ + if (unlikely(len < skb_headlen(skb))) __skb_pull(skb, len); - } else { - if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL) - return -ENOMEM; - } + else + __pskb_trim_head(skb, len - skb_headlen(skb)); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_HW; -- cgit v1.2.3 From 79320d7e14900c549c3520791a297328f53ff71e Mon Sep 17 00:00:00 2001 From: Aki M Nyrhinen Date: Sun, 11 Jun 2006 21:18:56 -0700 Subject: [TCP]: continued: reno sacked_out count fix From: Aki M Nyrhinen IMHO the current fix to the problem (in_flight underflow in reno) is incorrect. it treats the symptons but ignores the problem. the problem is timing out packets other than the head packet when we don't have sack. i try to explain (sorry if explaining the obvious). with sack, scanning the retransmit queue for timed out packets is fine because we know which packets in our retransmit queue have been acked by the receiver. without sack, we know only how many packets in our retransmit queue the receiver has acknowledged, but no idea which packets. think of a "typical" slow-start overshoot case, where for example every third packet in a window get lost because a router buffer gets full. with sack, we check for timeouts on those every third packet (as the rest have been sacked). the packet counting works out and if there is no reordering, we'll retransmit exactly the packets that were lost. without sack, however, we check for timeout on every packet and end up retransmitting consecutive packets in the retransmit queue. in our slow-start example, 2/3 of those retransmissions are unnecessary. these unnecessary retransmissions eat the congestion window and evetually prevent fast recovery from continuing, if enough packets were lost. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a538bc1683d..b5521a9d3dc1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1649,7 +1649,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1662,8 +1662,6 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, tcp_skb_pcount(skb) + 1); /* clear xmit_retrans hint */ if (tp->retransmit_skb_hint && -- cgit v1.2.3 From 42d1d52e695d87475846e9a09964cae1209eeecb Mon Sep 17 00:00:00 2001 From: Weidong Date: Mon, 12 Jun 2006 13:09:59 -0700 Subject: [IPV4]: Increment ipInHdrErrors when TTL expires. Signed-off-by: Weidong Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0923add122b4..9f0bb529ab70 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -116,6 +116,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); -- cgit v1.2.3 From a1e8733e557bb390e13aa00ef044a6022c8d0bb2 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:28 -0700 Subject: [NET]: Export ip_dev_find() Export ip_dev_find() to allow locating a net_device given an IP address. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- net/ipv4/fib_frontend.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdde96390960..31387abf53a2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -666,3 +666,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(ip_dev_find); -- cgit v1.2.3 From 0e4b4992b8007c6b62ec143cbbb292f98813ca11 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:00:16 -0700 Subject: [I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index d0c2c2fa7587..578cccf275d3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); +extern void tcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b8055037..1c0cfd7a8bbb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } -- cgit v1.2.3 From 624d1164730d58a494cc5aa4afa37d02c41e83a7 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:01:28 -0700 Subject: [I/OAT]: Make sk_eat_skb I/OAT aware. Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/net/sock.h | 13 ++++++++++++- net/dccp/proto.c | 4 ++-- net/ipv4/tcp.c | 8 ++++---- net/llc/af_llc.c | 2 +- 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock.h b/include/net/sock.h index 90c65cb091a8..75b0e97ed93d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, &sk->sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(&sk->sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee8355c41..5317fd3e6691 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c0cfd7a8bbb..4e067d25a63c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc->count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len > 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db745c8d..7465170a36ca 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, continue; if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len > 0); -- cgit v1.2.3 From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:02:55 -0700 Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold Any socket recv of less than this ammount will not be offloaded Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/linux/sysctl.h | 1 + include/net/tcp.h | 1 + net/core/user_dma.c | 4 ++++ net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 4 files changed, 16 insertions(+) (limited to 'net/ipv4') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 76eaeff76f82..cd9e7c0825ad 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index 578cccf275d3..f1f472746e6c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 9eee91bcbf3f..b7c98dbcdb81 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -30,6 +30,10 @@ #include /* for BUG_TRAP */ #include +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3adfcf00..6a6aa537b7aa 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3 From 1a2449a87bb7606113b1aa1a9d3c3e78ef189a1c Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:05:53 -0700 Subject: [I/OAT]: TCP recv offload to I/OAT Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 103 +++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_input.c | 74 ++++++++++++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 18 ++++++++- net/ipv6/tcp_ipv6.c | 12 +++++- 4 files changed, 185 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e067d25a63c..ff6ccda9ff46 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -263,7 +263,7 @@ #include #include #include - +#include #include #include @@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1345,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1397,19 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len > 0); @@ -1386,6 +1432,36 @@ skip_copy: tp->ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + while (dma_async_memcpy_complete(tp->ucopy.dma_chan, + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { + /* do partial cleanup of sk_async_wait_queue */ + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + dma_chan_put(tp->ucopy.dma_chan); + tp->ucopy.dma_chan = NULL; + } + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -1658,6 +1734,9 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif inet->dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5521a9d3dc1..c6d62f0a9966 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -71,6 +71,7 @@ #include #include #include +#include int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; @@ -3785,6 +3786,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk __tcp_checksum_complete_user(sk, skb); } +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + /* * TCP receive function for the ESTABLISHED state. * @@ -3901,14 +3946,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; + int copied_early = 0; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -3924,8 +3978,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -3966,6 +4021,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __tcp_ack_snd_check(sk, 0); no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif if (eaten) __kfree_skb(skb); else diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e54c49..25ecc6e2478b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -1091,8 +1092,18 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 301eee726b0f..a50eb306e9e2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1218,8 +1218,16 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); -- cgit v1.2.3 From 15986e1aadbbf40a331cddd0470bb434d156431d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 25 May 2006 16:11:14 -0700 Subject: [TCP]: tcp_rcv_rtt_measure_ts() call in pure-ACK path is superfluous We only want to take receive RTT mesaurements for data bearing frames, here in the header prediction fast path for a pure-sender, we know that we have a pure-ACK and thus the checks in tcp_rcv_rtt_mesaure_ts() will not pass. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c6d62f0a9966..6d167889a4b0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3931,8 +3931,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); - /* We know that such packets are checksummed * on entry. */ -- cgit v1.2.3 From 546be2405be119ef55467aace45f337a16e5d424 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:03:58 -0700 Subject: [IPSEC] xfrm: Undo afinfo lock proliferation The number of locks used to manage afinfo structures can easily be reduced down to one each for policy and state respectively. This is based on the observation that the write locks are only held by module insertion/removal which are very rare events so there is no need to further differentiate between the insertion of modules like ipv6 versus esp6. The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look suspicious at first. However, after you realise that nobody ever takes the corresponding write lock you'll feel better :) As far as I can gather it's an attempt to guard against the removal of the corresponding modules. Since neither module can be unloaded at all we can leave it to whoever fixes up IPv6 unloading :) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/xfrm.h | 9 +------- net/ipv4/xfrm4_policy.c | 6 ----- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_policy.c | 6 ----- net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_policy.c | 58 +++++++++++++++++++++++++++++-------------------- net/xfrm/xfrm_state.c | 9 +++----- 7 files changed, 38 insertions(+), 52 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index afa508d92c93..ed7c9747059d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -204,8 +204,7 @@ struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { unsigned short family; - rwlock_t lock; - struct xfrm_type_map *type_map; + struct xfrm_type *type_map[256]; struct dst_ops *dst_ops; void (*garbage_collect)(void); int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); @@ -232,7 +231,6 @@ extern int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { unsigned short family; - rwlock_t lock; struct list_head *state_bydst; struct list_head *state_byspi; int (*init_flags)(struct xfrm_state *x); @@ -264,11 +262,6 @@ struct xfrm_type u32 (*get_max_size)(struct xfrm_state *, int size); }; -struct xfrm_type_map { - rwlock_t lock; - struct xfrm_type *map[256]; -}; - extern int xfrm_register_type(struct xfrm_type *type, unsigned short family); extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c747bca5..c0465284dfac 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,8 +17,6 @@ static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { return __ip_route_output_key((struct rtable**)dst, fl); @@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) static inline int xfrm4_garbage_collect(void) { - read_lock(&xfrm4_policy_afinfo.lock); xfrm4_policy_afinfo.garbage_collect(); - read_unlock(&xfrm4_policy_afinfo.lock); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); } @@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm4_type_map, .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .find_bundle = __xfrm4_find_bundle, diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81a9b7b..81e1751c966e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .state_lookup = __xfrm4_state_lookup, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 88c840f1beb6..ee715f2691e9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,8 +23,6 @@ static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { int err = 0; @@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) static inline int xfrm6_garbage_collect(void) { - read_lock(&xfrm6_policy_afinfo.lock); xfrm6_policy_afinfo.garbage_collect(); - read_unlock(&xfrm6_policy_afinfo.lock); return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); } @@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm6_type_map, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .find_bundle = __xfrm6_find_bundle, diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index a5723024d3b3..b33296b3f6de 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, .init_tempsel = __xfrm6_init_tempsel, .state_lookup = __xfrm6_state_lookup, .find_acq = __xfrm6_find_acq, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b469c8b54613..44b64a593c01 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family); +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo); int xfrm_register_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (likely(typemap->map[type->proto] == NULL)) - typemap->map[type->proto] = type; + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; else err = -EEXIST; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_type); int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (unlikely(typemap->map[type->proto] != type)) + if (unlikely(typemap[type->proto] != type)) err = -ENOENT; else - typemap->map[type->proto] = NULL; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + typemap[type->proto] = NULL; + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type); struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { struct xfrm_policy_afinfo *afinfo; - struct xfrm_type_map *typemap; + struct xfrm_type **typemap; struct xfrm_type *type; int modload_attempted = 0; @@ -102,11 +100,9 @@ retry: return NULL; typemap = afinfo->type_map; - read_lock(&typemap->lock); - type = typemap->map[proto]; + type = typemap[proto]; if (unlikely(type && !try_module_get(type->owner))) type = NULL; - read_unlock(&typemap->lock); if (!type && !modload_attempted) { xfrm_policy_put_afinfo(afinfo); request_module("xfrm-type-%d-%d", @@ -1306,17 +1302,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_policy_afinfo_lock); afinfo = xfrm_policy_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_policy_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_policy_afinfo_lock); return afinfo; } static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); +} + +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + write_lock_bh(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (unlikely(!afinfo)) + write_unlock_bh(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + write_unlock_bh(&xfrm_policy_afinfo_lock); } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 93a2f36ad3db..ee62c239a7e3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1103,17 +1103,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_state_afinfo_lock); afinfo = xfrm_state_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_state_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_state_afinfo_lock); return afinfo; } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); } /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -- cgit v1.2.3 From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:05:54 -0700 Subject: [IPSEC] xfrm: Abstract out encapsulation modes This patch adds the structure xfrm_mode. It is meant to represent the operations carried out by transport/tunnel modes. By doing this we allow additional encapsulation modes to be added without clogging up the xfrm_input/xfrm_output paths. Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and BEET modes. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/xfrm.h | 4 ++ include/net/xfrm.h | 17 ++++++ net/ipv4/Kconfig | 18 ++++++ net/ipv4/Makefile | 2 + net/ipv4/xfrm4_input.c | 28 +-------- net/ipv4/xfrm4_mode_transport.c | 69 ++++++++++++++++++++++ net/ipv4/xfrm4_mode_tunnel.c | 125 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/xfrm4_output.c | 61 +------------------- net/ipv6/Kconfig | 20 +++++++ net/ipv6/Makefile | 2 + net/ipv6/ip6_output.c | 2 + net/ipv6/xfrm6_input.c | 29 +--------- net/ipv6/xfrm6_mode_transport.c | 73 +++++++++++++++++++++++ net/ipv6/xfrm6_mode_tunnel.c | 121 ++++++++++++++++++++++++++++++++++++++ net/ipv6/xfrm6_output.c | 63 +------------------- net/xfrm/xfrm_policy.c | 83 ++++++++++++++++++++++++++ net/xfrm/xfrm_state.c | 6 ++ 17 files changed, 553 insertions(+), 170 deletions(-) create mode 100644 net/ipv4/xfrm4_mode_transport.c create mode 100644 net/ipv4/xfrm4_mode_tunnel.c create mode 100644 net/ipv6/xfrm6_mode_transport.c create mode 100644 net/ipv6/xfrm6_mode_tunnel.c (limited to 'net/ipv4') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 6b42cc474c01..46a15c7a1a13 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -118,6 +118,10 @@ enum XFRM_SHARE_UNIQUE /* Use once */ }; +#define XFRM_MODE_TRANSPORT 0 +#define XFRM_MODE_TUNNEL 1 +#define XFRM_MODE_MAX 2 + /* Netlink configuration messages. */ enum { XFRM_MSG_BASE = 0x10, diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ed7c9747059d..ed5bb34f817f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,6 +20,8 @@ #include #define XFRM_ALIGN8(len) (((len) + 7) & ~7) +#define MODULE_ALIAS_XFRM_MODE(family, encap) \ + MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) extern struct sock *xfrm_nl; extern u32 sysctl_xfrm_aevent_etime; @@ -164,6 +166,7 @@ struct xfrm_state /* Reference to data common to all the instances of this * transformer. */ struct xfrm_type *type; + struct xfrm_mode *mode; /* Security context */ struct xfrm_sec_ctx *security; @@ -205,6 +208,7 @@ struct xfrm_dst; struct xfrm_policy_afinfo { unsigned short family; struct xfrm_type *type_map[256]; + struct xfrm_mode *mode_map[XFRM_MODE_MAX]; struct dst_ops *dst_ops; void (*garbage_collect)(void); int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); @@ -267,6 +271,19 @@ extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); extern void xfrm_put_type(struct xfrm_type *type); +struct xfrm_mode { + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct sk_buff *skb); + + struct module *owner; + unsigned int encap; +}; + +extern int xfrm_register_mode(struct xfrm_mode *mode, int family); +extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family); +extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family); +extern void xfrm_put_mode(struct xfrm_mode *mode); + struct xfrm_tmpl { /* id in template is interpreted as: diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f75322377..35eb70b1448d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -414,6 +414,24 @@ config INET_TUNNEL tristate default n +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0b9d2c..4cc94482eacc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o +obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o +obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c83bfe7..817ed84511a6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) EXPORT_SYMBOL(xfrm4_rcv); -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} - static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) { switch (nexthdr) { @@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) xfrm_vec[xfrm_nr++] = x; - iph = skb->nh.iph; + if (x->mode->input(x, skb)) + goto drop; if (x->props.mode) { - if (iph->protocol != IPPROTO_IPIP) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); decaps = 1; break; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 000000000000..e46d9a4ccc55 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c @@ -0,0 +1,69 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x; + struct iphdr *iph; + int ihl; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + ihl = iph->ihl * 4; + skb->h.raw += ihl; + + x = skb->dst->xfrm; + skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + return 0; +} + +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 000000000000..f8d880beb12f --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -0,0 +1,125 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int flags; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : (iph->frag_off & htons(IP_DF)); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst->child, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + return 0; +} + +static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int err = -EINVAL; + + if (iph->protocol != IPPROTO_IPIP) + goto out; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input = xfrm4_tunnel_input, + .output = xfrm4_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm4_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_tunnel_init); +module_exit(xfrm4_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efaf6a67..ac9d91d4bb05 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -12,67 +12,10 @@ #include #include #include -#include #include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header will be moved forward to make space - * for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. - */ -static void xfrm4_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph, *top_iph; - int flags; - - iph = skb->nh.iph; - skb->h.ipiph = iph; - - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; - - if (!x->props.mode) { - skb->h.raw += iph->ihl*4; - memmove(top_iph, iph, iph->ihl*4); - return; - } - - top_iph->ihl = 5; - top_iph->version = 4; - - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); - - top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - top_iph->protocol = IPPROTO_IPIP; - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -} - static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb) if (err) goto error; - xfrm4_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f8a107ab5592..e923d4dea418 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -106,6 +106,26 @@ config INET6_TUNNEL tristate default n +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a760b0988fbb..386e0a626948 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e46048974f37..416f6e428a0a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } +EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00cfdee18dca..0405d74ff910 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -13,21 +13,9 @@ #include #include #include -#include -#include -#include #include #include -static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) -{ - struct ipv6hdr *outer_iph = skb->nh.ipv6h; - struct ipv6hdr *inner_iph = skb->h.ipv6h; - - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); -} - int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) { int err; @@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) xfrm_vec[xfrm_nr++] = x; + if (x->mode->input(x, skb)) + goto drop; + if (x->props.mode) { /* XXX */ - if (nexthdr != IPPROTO_IPV6) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; decaps = 1; break; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 000000000000..5efbbae08ef0 --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c @@ -0,0 +1,73 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x = skb->dst->xfrm; + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return 0; +} + +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 000000000000..8af79be2edca --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -0,0 +1,121 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 80242172a5df..16e84254a252 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -14,68 +14,9 @@ #include #include #include -#include -#include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header and mutable extension headers will be moved - * forward to make space for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * payload_len - * - * On exit, skb->h will be set to the start of the encapsulation header to be - * filled in by x->type->output and skb->nh will be set to the nextheader field - * of the extension header directly preceding the encapsulation header, or in - * its absence, that of the top IP header. The value of skb->data will always - * point to the top IP header. - */ -static void xfrm6_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct ipv6hdr *iph, *top_iph; - int dsfield; - - skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; - - if (!x->props.mode) { - u8 *prevhdr; - int hdr_len; - - hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; - memmove(skb->data, iph, hdr_len); - return; - } - - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; - - top_iph->version = 6; - top_iph->priority = iph->priority; - top_iph->flow_lbl[0] = iph->flow_lbl[0]; - top_iph->flow_lbl[1] = iph->flow_lbl[1]; - top_iph->flow_lbl[2] = iph->flow_lbl[2]; - dsfield = ipv6_get_dsfield(top_iph); - dsfield = INET_ECN_encapsulate(dsfield, dsfield); - if (x->props.flags & XFRM_STATE_NOECN) - dsfield &= ~INET_ECN_MASK; - ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->nexthdr = IPPROTO_IPV6; - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); -} - static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb) if (err) goto error; - xfrm6_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 44b64a593c01..b8936926c24b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type) module_put(type->owner); } +int xfrm_register_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -EEXIST; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == NULL)) { + modemap[mode->encap] = mode; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_mode); + +int xfrm_unregister_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -ENOENT; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == mode)) { + modemap[mode->encap] = NULL; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_mode); + +struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode *mode; + int modload_attempted = 0; + + if (unlikely(encap >= XFRM_MODE_MAX)) + return NULL; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + + mode = afinfo->mode_map[encap]; + if (unlikely(mode && !try_module_get(mode->owner))) + mode = NULL; + if (!mode && !modload_attempted) { + xfrm_policy_put_afinfo(afinfo); + request_module("xfrm-mode-%d-%d", family, encap); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return mode; +} + +void xfrm_put_mode(struct xfrm_mode *mode) +{ + module_put(mode->owner); +} + static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ee62c239a7e3..17b29ec3c417 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->ealg); kfree(x->calg); kfree(x->encap); + if (x->mode) + xfrm_put_mode(x->mode); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x) if (err) goto error; + x->mode = xfrm_get_mode(x->props.mode, family); + if (x->mode == NULL) + goto error; + x->km.state = XFRM_STATE_VALID; error: -- cgit v1.2.3 From 31a4ab93025719e62e7cf7ce899f71c34ecde5a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:06:13 -0700 Subject: [IPSEC] proto: Move transport mode input path into xfrm_mode_transport Now that we have xfrm_mode objects we can move the transport mode specific input decapsulation code into xfrm_mode_transport. This removes duplicate code as well as unnecessary header movement in case of tunnel mode SAs since we will discard the original IP header immediately. This also fixes a minor bug for transport-mode ESP where the IP payload length is set to the correct value minus the header length (with extension headers for IPv6). Of course the other neat thing is that we no longer have to allocate temporary buffers to hold the IP headers for ESP and IPComp. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 15 +++++++-------- net/ipv4/esp4.c | 18 ++++++------------ net/ipv4/ipcomp.c | 23 +++++------------------ net/ipv4/xfrm4_mode_transport.c | 14 ++++++++++++++ net/ipv6/ah6.c | 10 +++------- net/ipv6/esp6.c | 20 ++++---------------- net/ipv6/ipcomp6.c | 27 +++++---------------------- net/ipv6/xfrm6_mode_transport.c | 15 +++++++++++++++ 8 files changed, 59 insertions(+), 83 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771fa4c6..c7782230080d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -119,6 +119,7 @@ error: static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; + int ihl; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; @@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr*)skb->data; iph = skb->nh.iph; - memcpy(work_buf, iph, iph->ihl*4); + ihl = skb->data - skb->nh.raw; + memcpy(work_buf, iph, ihl); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; - if (iph->ihl != 5) { + if (ihl > sizeof(*iph)) { u32 dummy; if (ip_clear_mutable_options(iph, &dummy)) goto out; @@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) u8 auth_data[MAX_AH_AUTH_LEN]; memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, ihl); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { x->stats.integrity_failed++; @@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, work_buf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); - skb_pull(skb, skb->nh.iph->ihl*4); - skb->h.raw = skb->data; + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + __skb_pull(skb, ah_hlen + ihl); return 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c07a32..9bbdd4494551 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; int nfrags; - int encap_len = 0; + int ihl; u8 nexthdr[2]; struct scatterlist *sg; - u8 workbuf[60]; int padlen; if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) @@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr*)skb->data; - iph = skb->nh.iph; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - - uh = (struct udphdr *)(iph + 1); - encap_len = (void*)esph - (void*)uh; + struct udphdr *uh = (void *)(skb->nh.raw + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - memcpy(workbuf, skb->nh.raw, iph->ihl*4); - skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, workbuf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; return 0; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b22b669..8e243589045f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { int err, plen, dlen; - struct iphdr *iph; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; struct crypto_tfm *tfm; @@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); - iph = skb->nh.iph; - iph->tot_len = htons(dlen + iph->ihl * 4); out: put_cpu(); return err; @@ -83,14 +80,9 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - u8 nexthdr; int err = 0; struct iphdr *iph; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; - + struct ip_comp_hdr *ipch; if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && skb_linearize(skb, GFP_ATOMIC) != 0) { @@ -102,15 +94,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.iph; - memcpy(&tmp_iph, iph, iph->ihl * 4); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ip_comp_hdr)); - skb->nh.raw += sizeof(struct ip_comp_hdr); - memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); - iph = skb->nh.iph; - iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); - iph->protocol = nexthdr; - skb->h.raw = skb->data; + ipch = (void *)skb->data; + iph->protocol = ipch->nexthdr; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); out: diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index e46d9a4ccc55..a9e6b3dd19c9 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -38,8 +38,22 @@ static int xfrm4_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.iph->tot_len = htons(skb->len + ihl); + skb->h.raw = skb->data; return 0; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 6778173a3dda..d31c0d6c0448 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, hdr_len); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); @@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) } } - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); - skb_pull(skb, hdr_len); - skb->h.raw = skb->data; - + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); kfree(tmp_hdr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 22f046079037..a15a6f320f70 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) int hdr_len = skb->h.raw - skb->nh.raw; int nfrags; - unsigned char *tmp_hdr = NULL; int ret = 0; if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { ret = -EINVAL; - goto out_nofree; + goto out; } if (elen <= 0 || (elen & (blksize-1))) { ret = -EINVAL; - goto out_nofree; - } - - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) { - ret = -ENOMEM; - goto out_nofree; + goto out; } - memcpy(tmp_hdr, skb->nh.raw, hdr_len); /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { @@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ret = nexthdr[1]; } + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; + out: - kfree(tmp_hdr); -out_nofree: return ret; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 48636436028a..cec3be544b69 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -66,10 +66,8 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; - u8 nexthdr = 0; - int hdr_len = skb->h.raw - skb->nh.raw; - unsigned char *tmp_hdr = NULL; struct ipv6hdr *iph; + struct ipv6_comp_hdr *ipch; int plen, dlen; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; @@ -86,17 +84,9 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.ipv6h; - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) - goto out; - memcpy(tmp_hdr, iph, hdr_len); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ipv6_comp_hdr)); - skb->nh.raw += sizeof(struct ipv6_comp_hdr); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - iph = skb->nh.ipv6h; - iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); - skb->h.raw = skb->data; + ipch = (void *)skb->data; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); /* decompression */ plen = skb->len; @@ -125,18 +115,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); + err = ipch->nexthdr; - iph = skb->nh.ipv6h; - iph->payload_len = htons(skb->len); - out_put_cpu: put_cpu(); out: - kfree(tmp_hdr); - if (err) - goto error_out; - return nexthdr; -error_out: return err; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 5efbbae08ef0..711d713e36d8 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -42,8 +42,23 @@ static int xfrm6_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.ipv6h->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb->h.raw = skb->data; return 0; } -- cgit v1.2.3 From 3e72b2fe5b31791f976350b023b7a37ef59c02c1 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:19 -0700 Subject: [NETFILTER]: x_tables: remove some unnecessary casts Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_dccp.c | 3 +-- net/netfilter/xt_mark.c | 2 +- net/netfilter/xt_sctp.c | 4 +--- net/netfilter/xt_string.c | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c4646e..b88adc7f4b47 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -561,7 +561,7 @@ static void hashlimit_destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) { - struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + struct ipt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index dc26a27cbcaf..56324c8aff0a 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -58,7 +58,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo; + struct xt_connmark_info *cm = matchinfo; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { printk(KERN_WARNING "connmark: only support 32bit mark\n"); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index dfb10b648e57..2e2f825dad4c 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -101,8 +101,7 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_dccp_info *info = - (const struct xt_dccp_info *)matchinfo; + const struct xt_dccp_info *info = matchinfo; struct dccp_hdr _dh, *dh; if (offset) diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 8b385a34886d..876bc5797738 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -42,7 +42,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; + const struct xt_mark_info *minfo = matchinfo; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { printk(KERN_WARNING "mark: only supports 32bit mark\n"); diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 34bd87259a09..b5110e5b54b0 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -129,11 +129,9 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_sctp_info *info; + const struct xt_sctp_info *info = matchinfo; sctp_sctphdr_t _sh, *sh; - info = (const struct xt_sctp_info *)matchinfo; - if (offset) { duprintf("Dropping non-first fragment.. FIXME\n"); return 0; diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 79d9ea6964ba..0ebb6ac2c8c7 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { + const struct xt_string_info *conf = matchinfo; struct ts_state state; - struct xt_string_info *conf = (struct xt_string_info *) matchinfo; memset(&state, 0, sizeof(struct ts_state)); -- cgit v1.2.3 From 957dc80ac30f3c4d53259fa936df807663ba54fa Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:56 -0700 Subject: [NETFILTER]: x_tables: add SCTP/DCCP support where missing Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 20 ++++-------- net/ipv4/netfilter/ipt_hashlimit.c | 64 ++++++++++---------------------------- net/netfilter/xt_multiport.c | 7 +++-- 3 files changed, 26 insertions(+), 65 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28c8d71..dbc83c5d7aa6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) struct iphdr *iph = skb->nh.iph; unsigned long hashval; u_int16_t sport, dport; - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *ih; + u_int16_t *ports; switch (iph->protocol) { case IPPROTO_TCP: - th = (void *)iph+iph->ihl*4; - sport = ntohs(th->source); - dport = ntohs(th->dest); - break; case IPPROTO_UDP: - uh = (void *)iph+iph->ihl*4; - sport = ntohs(uh->source); - dport = ntohs(uh->dest); - break; + case IPPROTO_SCTP: + case IPPROTO_DCCP: case IPPROTO_ICMP: - ih = (void *)iph+iph->ihl*4; - sport = ntohs(ih->un.echo.id); - dport = (ih->type<<8)|ih->code; + ports = (void *)iph+iph->ihl*4; + sport = ports[0]; + dport = ports[1]; break; default: if (net_ratelimit()) { diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index b88adc7f4b47..85edfb79469a 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -28,9 +28,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -381,49 +378,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) dh->rateinfo.credit = dh->rateinfo.credit_cap; } -static inline int get_ports(const struct sk_buff *skb, int offset, - u16 ports[2]) -{ - union { - struct tcphdr th; - struct udphdr uh; - sctp_sctphdr_t sctph; - } hdr_u, *ptr_u; - - /* Must not be a fragment. */ - if (offset) - return 1; - - /* Must be big enough to read ports (both UDP and TCP have - them at the start). */ - ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); - if (!ptr_u) - return 1; - - switch (skb->nh.iph->protocol) { - case IPPROTO_TCP: - ports[0] = ptr_u->th.source; - ports[1] = ptr_u->th.dest; - break; - case IPPROTO_UDP: - ports[0] = ptr_u->uh.source; - ports[1] = ptr_u->uh.dest; - break; - case IPPROTO_SCTP: - ports[0] = ptr_u->sctph.source; - ports[1] = ptr_u->sctph.dest; - break; - default: - /* all other protocols don't supprot per-port hash - * buckets */ - ports[0] = ports[1] = 0; - break; - } - - return 0; -} - - static int hashlimit_match(const struct sk_buff *skb, const struct net_device *in, @@ -449,8 +403,22 @@ hashlimit_match(const struct sk_buff *skb, dst.src_ip = skb->nh.iph->saddr; if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { - u_int16_t ports[2]; - if (get_ports(skb, offset, ports)) { + u_int16_t _ports[2], *ports; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), &_ports); + break; + default: + _ports[0] = _ports[1] = 0; + ports = _ports; + break; + } + if (!ports) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ *hotdrop = 1; diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index b56cd2baaac2..1ff0a25396e7 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -1,4 +1,4 @@ -/* Kernel module to match one of a list of TCP/UDP ports: ports are in +/* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in the same place so we can treat them as equal. */ /* (C) 1999-2001 Paul `Rusty' Russell @@ -160,8 +160,9 @@ check(u_int16_t proto, u_int8_t match_flags, u_int8_t count) { - /* Must specify proto == TCP/UDP, no unknown flags or bad count */ - return (proto == IPPROTO_TCP || proto == IPPROTO_UDP) + /* Must specify supported protocol, no unknown flags or bad count */ + return (proto == IPPROTO_TCP || proto == IPPROTO_UDP + || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) && !(ip_invflags & XT_INV_PROTO) && (match_flags == XT_MULTIPORT_SOURCE || match_flags == XT_MULTIPORT_DESTINATION -- cgit v1.2.3 From 404bdbfd242cb99ca0e9d3eb5fbb5bcd54123081 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:34 -0700 Subject: [NETFILTER]: recent match: replace by rewritten version Replace the unmaintainable ipt_recent match by a rewritten version that should be fully compatible. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1268 ++++++++++++--------------------------- 1 file changed, 377 insertions(+), 891 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee409efb..9686c4d2057d 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -1,1007 +1,493 @@ -/* Kernel module to check if the source address has been seen recently. */ -/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ -/* Author: Stephen Frost */ -/* Project Page: http://snowman.net/projects/ipt_recent/ */ -/* This software is distributed under the terms of the GPL, Version 2 */ -/* This copyright does not cover user programs that use kernel services - * by normal system calls. */ - -#include -#include +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include +#include #include -#include -#include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include #include -#undef DEBUG -#define HASH_LOG 9 +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_LICENSE("GPL"); -/* Defaults, these can be overridden on the module command-line. */ static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; static unsigned int ip_list_hash_size = 0; static unsigned int ip_list_perms = 0644; -#ifdef DEBUG -static int debug = 1; -#endif - -static char version[] = -KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost . http://snowman.net/projects/ipt_recent/\n"; - -MODULE_AUTHOR("Stephen Frost "); -MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); -MODULE_LICENSE("GPL"); module_param(ip_list_tot, uint, 0400); module_param(ip_pkt_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -#ifdef DEBUG -module_param(debug, bool, 0600); -MODULE_PARM_DESC(debug,"enable debugging output"); -#endif -MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); -MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); - -/* Structure of our list of recently seen addresses. */ -struct recent_ip_list { - u_int32_t addr; - u_int8_t ttl; - unsigned long last_seen; - unsigned long *last_pkts; - u_int32_t oldest_pkt; - u_int32_t hash_entry; - u_int32_t time_pos; -}; - -struct time_info_list { - u_int32_t position; - u_int32_t time; +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); + + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + u_int32_t addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; }; -/* Structure of our linked list of tables of recent lists. */ -struct recent_ip_tables { - char name[IPT_RECENT_NAME_LEN]; - int count; - int time_pos; - struct recent_ip_list *table; - struct recent_ip_tables *next; - spinlock_t list_lock; - int *hash_table; - struct time_info_list *time_info; +struct recent_table { + struct list_head list; + char name[IPT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *status_proc; -#endif /* CONFIG_PROC_FS */ + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; }; -/* Our current list of addresses we have recently seen. - * Only added to on a --set, and only updated on --set || --update - */ -static struct recent_ip_tables *r_tables = NULL; - -/* We protect r_list with this spinlock so two processors are not modifying - * the list at the same time. - */ +static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); #ifdef CONFIG_PROC_FS -/* Our /proc/net/ipt_recent entry */ -static struct proc_dir_entry *proc_net_ipt_recent = NULL; -#endif - -/* Function declaration for later. */ -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop); - -/* Function to hash a given address into the hash table of table_size size */ -static int hash_func(unsigned int addr, int table_size) -{ - int result = 0; - unsigned int value = addr; - do { result ^= value; } while((value >>= HASH_LOG)); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", - result & (table_size - 1), - addr, - table_size); +static struct proc_dir_entry *proc_dir; +static struct file_operations recent_fops; #endif - return(result & (table_size - 1)); -} +static u_int32_t hash_rnd; +static int hash_rnd_initted; -#ifdef CONFIG_PROC_FS -/* This is the function which produces the output for our /proc output - * interface which lists each IP address, the last seen time and the - * other recent times the address was seen. - */ - -static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +static unsigned int recent_entry_hash(u_int32_t addr) { - int len = 0, count, last_len = 0, pkt_count; - off_t pos = 0; - off_t begin = 0; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - spin_lock_bh(&curr_table->list_lock); - for(count = 0; count < ip_list_tot; count++) { - if(!curr_table->table[count].addr) continue; - last_len = len; - len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); - len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); - len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); - len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); - len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); - for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(!curr_table->table[count].last_pkts[pkt_count]) break; - len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); - } - len += sprintf(buffer+len,"\n"); - pos = begin + len; - if(pos < offset) { len = 0; begin = pos; } - if(pos > offset + length) { len = last_len; break; } + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; } - - *start = buffer + (offset - begin); - len -= (offset - begin); - if(len > length) len = length; - - spin_unlock_bh(&curr_table->list_lock); - return len; + return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); } -/* ip_recent_ctrl provides an interface for users to modify the table - * directly. This allows adding entries, removing entries, and - * flushing the entire table. - * This is done by opening up the appropriate table for writing and - * sending one of: - * xx.xx.xx.xx -- Add entry to table with current time - * +xx.xx.xx.xx -- Add entry to table with current time - * -xx.xx.xx.xx -- Remove entry from table - * clear -- Flush table, remove all entries - */ - -static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) { - static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; - u_int32_t val; - int base, used = 0; - char c, *cp; - union iaddr { - uint8_t bytes[4]; - uint32_t word; - } res; - uint8_t *pp = res.bytes; - int digit; - - char buffer[20]; - int len, check_set = 0, count; - u_int32_t addr = 0; - struct sk_buff *skb; - struct ipt_recent_info *info; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - if(size > 20) len = 20; else len = size; - - if(copy_from_user(buffer,input,len)) return -EFAULT; - - if(len < 20) buffer[len] = '\0'; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); -#endif + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} - cp = buffer; - while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} - /* Check if we are asked to flush the entire table */ - if(!memcmp(cp,"clear",5)) { - used += 5; - spin_lock_bh(&curr_table->list_lock); - curr_table->time_pos = 0; - for(count = 0; count < ip_list_hash_size; count++) { - curr_table->hash_table[count] = -1; - } - for(count = 0; count < ip_list_tot; count++) { - curr_table->table[count].last_seen = 0; - curr_table->table[count].addr = 0; - curr_table->table[count].ttl = 0; - memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - curr_table->table[count].oldest_pkt = 0; - curr_table->table[count].time_pos = 0; - curr_table->time_info[count].position = count; - curr_table->time_info[count].time = 0; - } - spin_unlock_bh(&curr_table->list_lock); - return used; - } +static struct recent_entry * +recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) +{ + struct recent_entry *e; - check_set = IPT_RECENT_SET; - switch(*cp) { - case '+': check_set = IPT_RECENT_SET; cp++; used++; break; - case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; - default: if(!isdigit(*cp)) return (used+1); break; + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); -#endif - /* Get addr (effectively inet_aton()) */ - /* Shamelessly stolen from libc, a function in the kernel for doing - * this would, of course, be greatly preferred, but our options appear - * to be rather limited, so we will just do it ourselves here. - */ - res.word = 0; - - c = *cp; - for(;;) { - if(!isdigit(c)) return used; - val = 0; base = 10; digit = 0; - if(c == '0') { - c = *++cp; - if(c == 'x' || c == 'X') base = 16, c = *++cp; - else { base = 8; digit = 1; } - } - for(;;) { - if(isascii(c) && isdigit(c)) { - if(base == 8 && (c == '8' || c == '0')) return used; - val = (val * base) + (c - '0'); - c = *++cp; - digit = 1; - } else if(base == 16 && isascii(c) && isxdigit(c)) { - val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); - c = *++cp; - digit = 1; - } else break; - } - if(c == '.') { - if(pp > res.bytes + 2 || val > 0xff) return used; - *pp++ = val; - c = *++cp; - } else break; - } - used = cp - buffer; - if(c != '\0' && (!isascii(c) || !isspace(c))) return used; - if(c == '\n') used++; - if(!digit) return used; +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} - if(val > max[pp - res.bytes]) return used; - addr = res.word | htonl(val); +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; - if(!addr && check_set == IPT_RECENT_SET) return used; + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); -#endif +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; - /* Set up and just call match */ - info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); - if(!info) { return -ENOMEM; } - info->seconds = 0; - info->hit_count = 0; - info->check_set = check_set; - info->invert = 0; - info->side = IPT_RECENT_SOURCE; - strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); - info->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); - if (!skb) { - used = -ENOMEM; - goto out_free_info; - } - skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); - if (!skb->nh.iph) { - used = -ENOMEM; - goto out_free_skb; + for (i = 0; i < ip_list_hash_size; i++) { + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); } - - skb->nh.iph->saddr = addr; - skb->nh.iph->daddr = 0; - /* Clear ttl since we have no way of knowing it */ - skb->nh.iph->ttl = 0; - match(skb,NULL,NULL,NULL,info,0,0,NULL); - - kfree(skb->nh.iph); -out_free_skb: - kfree(skb); -out_free_info: - kfree(info); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); -#endif - return used; } -#endif /* CONFIG_PROC_FS */ - -/* 'match' is our primary function, called by the kernel whenever a rule is - * hit with our module as an option to it. - * What this function does depends on what was specifically asked of it by - * the user: - * --set -- Add or update last seen time of the source address of the packet - * -- matchinfo->check_set == IPT_RECENT_SET - * --rcheck -- Just check if the source address is in the list - * -- matchinfo->check_set == IPT_RECENT_CHECK - * --update -- If the source address is in the list, update last_seen - * -- matchinfo->check_set == IPT_RECENT_UPDATE - * --remove -- If the source address is in the list, remove it - * -- matchinfo->check_set == IPT_RECENT_REMOVE - * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds - * -- matchinfo->seconds - * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times - * -- matchinfo->hit_count - * --seconds and --hitcount can be combined - */ static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +ipt_recent_match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) { - int pkt_count, hits_found, ans; - unsigned long now; const struct ipt_recent_info *info = matchinfo; - u_int32_t addr = 0, time_temp; - u_int8_t ttl = skb->nh.iph->ttl; - int *hash_table; - int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; - struct time_info_list *time_info; - struct recent_ip_tables *curr_table; - struct recent_ip_tables *last_table; - struct recent_ip_list *r_list; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); -#endif - - /* Default is false ^ info->invert */ - ans = info->invert; + struct recent_table *t; + struct recent_entry *e; + u_int32_t addr; + u_int8_t ttl; + int ret = info->invert; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); -#endif + if (info->side == IPT_RECENT_DEST) + addr = skb->nh.iph->daddr; + else + addr = skb->nh.iph->saddr; - /* if out != NULL then routing has been done and TTL changed. - * We change it back here internally for match what came in before routing. */ - if(out) ttl++; + ttl = skb->nh.iph->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; - /* Find the right table */ spin_lock_bh(&recent_lock); - curr_table = r_tables; - while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); -#endif - - spin_unlock_bh(&recent_lock); - - /* Table with this name not found, match impossible */ - if(!curr_table) { return ans; } - - /* Make sure no one is changing the list while we work with it */ - spin_lock_bh(&curr_table->list_lock); - - r_list = curr_table->table; - if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; - - if(!addr) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); -#endif - spin_unlock_bh(&curr_table->list_lock); - return ans; + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & IPT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & IPT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = 1; + ret ^= 1; + goto out; } -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); -#endif - - /* Get jiffies now in case they changed while we were waiting for a lock */ - now = jiffies; - hash_table = curr_table->hash_table; - time_info = curr_table->time_info; - - orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); - /* Hash entry at this result used */ - /* Check for TTL match if requested. If TTL is zero then a match would never - * happen, so match regardless of existing TTL in that case. Zero means the - * entry was added via the /proc interface anyway, so we will just use the - * first TTL we get for that IP address. */ - if(info->check_set & IPT_RECENT_TTL) { - while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && - (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } else { - while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } - - if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { - /* IP not in list and not asked to SET */ - spin_unlock_bh(&curr_table->list_lock); - return ans; - } - - /* Check if we need to handle the collision, do not need to on REMOVE */ - if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", - orig_hash_result, - hash_result, - r_list[hash_table[orig_hash_result]].addr, - addr); -#endif - - /* We had a collision. - * orig_hash_result is where we started, hash_result is where we ended up. - * So, swap them because we are likely to see the same guy again sooner */ -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); - printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", - r_list[hash_table[orig_hash_result]].hash_entry); - } -#endif - - r_list[hash_table[orig_hash_result]].hash_entry = hash_result; - - - temp = hash_table[orig_hash_result]; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); -#endif - hash_table[orig_hash_result] = hash_table[hash_result]; - hash_table[hash_result] = temp; - temp = hash_result; - hash_result = orig_hash_result; - orig_hash_result = temp; - time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; - if(hash_table[hash_result] != -1) { - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); -#endif - } - - if(hash_table[hash_result] == -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", - hash_result, addr); -#endif - - /* New item found and IPT_RECENT_SET, so we need to add it */ - location = time_info[curr_table->time_pos].position; - hash_table[r_list[location].hash_entry] = -1; - hash_table[hash_result] = location; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].time_pos = curr_table->time_pos; - r_list[location].addr = addr; - r_list[location].ttl = ttl; - r_list[location].last_seen = now; - r_list[location].oldest_pkt = 1; - r_list[location].last_pkts[0] = now; - r_list[location].hash_entry = hash_result; - time_info[curr_table->time_pos].time = r_list[location].last_seen; - curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; - - ans = !info->invert; - } else { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", - hash_result, - addr); -#endif - - /* Existing item found */ - location = hash_table[hash_result]; - /* We have a match on address, now to make sure it meets all requirements for a - * full match. */ - if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { - if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; - if(info->seconds && !info->hit_count) { - if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; - } - if(info->seconds && info->hit_count) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - if(info->hit_count && !info->seconds) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - } -#ifdef DEBUG - if(debug) { - if(ans) - printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); - else - printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); - } -#endif - - /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the - * current timestamp to the last_seen. */ - if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); -#endif - /* Have to update our time info */ - time_loc = r_list[location].time_pos; - time_info[time_loc].time = now; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; - } - r_list[location].time_pos = time_loc; - r_list[location].ttl = ttl; - r_list[location].last_pkts[r_list[location].oldest_pkt] = now; - r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; - r_list[location].last_seen = now; - } - /* If we have been asked to remove the entry from the list, just set it to 0 */ - if(info->check_set & IPT_RECENT_REMOVE) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); -#endif - /* Check if this is part of a collision chain */ - while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { - orig_hash_result++; - if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { - /* Found collision chain, how deep does this rabbit hole go? */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); -#endif - end_collision_chain = orig_hash_result; - } - } - if(end_collision_chain != -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); -#endif - /* Part of a collision chain, swap it with the end of the chain - * before removing. */ - r_list[hash_table[end_collision_chain]].hash_entry = hash_result; - temp = hash_table[end_collision_chain]; - hash_table[end_collision_chain] = hash_table[hash_result]; - hash_table[hash_result] = temp; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - hash_result = end_collision_chain; - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - location = hash_table[hash_result]; - hash_table[r_list[location].hash_entry] = -1; - time_loc = r_list[location].time_pos; - time_info[time_loc].time = 0; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; + if (info->check_set & IPT_RECENT_SET) + ret ^= 1; + else if (info->check_set & IPT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret ^= 1; + } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { + unsigned long t = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(t, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret ^= 1; + break; } - r_list[location].time_pos = time_loc; - r_list[location].last_seen = 0; - r_list[location].addr = 0; - r_list[location].ttl = 0; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].oldest_pkt = 0; - ans = !info->invert; } - spin_unlock_bh(&curr_table->list_lock); - return ans; } - spin_unlock_bh(&curr_table->list_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); -#endif - return ans; + if (info->check_set & IPT_RECENT_SET || + (info->check_set & IPT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; } -/* This function is to verify that the rule given during the userspace iptables - * command is correct. - * If the command is valid then we check if the table name referred to by the - * rule exists, if not it is created. - */ static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) +ipt_recent_checkentry(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) { - int flag = 0, c; - unsigned long *hold; const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *find_table, *last_table; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); -#endif - - /* seconds and hit_count only valid for CHECK/UPDATE */ - if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_CHECK) flag++; - if(info->check_set & IPT_RECENT_UPDATE) flag++; - - /* One and only one of these should ever be set */ - if(flag != 1) return 0; + struct recent_table *t; + unsigned i; + int ret = 0; - /* Name must be set to something */ - if(!info->name || !info->name[0]) return 0; + if (hweight8(info->check_set & + (IPT_RECENT_SET | IPT_RECENT_REMOVE | + IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) + return 0; + if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return 0; + if (info->name[0] == '\0' || + strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) + return 0; - /* Things look good, create a list for this if it does not exist */ - /* Lock the linked list while we play with it */ spin_lock_bh(&recent_lock); - - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); -#endif - find_table->count++; - spin_unlock_bh(&recent_lock); - return 1; + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = 1; + goto out; } - spin_unlock_bh(&recent_lock); - - /* Table with this name not found */ - /* Allocate memory for new linked list item */ - -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); - printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_ATOMIC); + if (t == NULL) + goto out; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); + if (t->proc == NULL) { + kfree(t); + goto out; } + t->proc->proc_fops = &recent_fops; + t->proc->data = t; #endif + list_add_tail(&t->list, &tables); + ret = 1; +out: + spin_unlock_bh(&recent_lock); + return ret; +} - curr_table = vmalloc(sizeof(struct recent_ip_tables)); - if(curr_table == NULL) return 0; - - spin_lock_init(&curr_table->list_lock); - curr_table->next = NULL; - curr_table->count = 1; - curr_table->time_pos = 0; - strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); - curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - /* Allocate memory for this table and the list of packets in each entry. */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", - sizeof(struct recent_ip_list)*ip_list_tot, - info->name); -#endif - - curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); - if(curr_table->table == NULL) { vfree(curr_table); return 0; } - memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", - sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#endif - - hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); -#endif - if(hold == NULL) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; - } +static void +ipt_recent_destroy(const struct xt_match *match, void *matchinfo, + unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_table *t; - /* Allocate memory for the hash table */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", - sizeof(int)*ip_list_hash_size); + spin_lock_bh(&recent_lock); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + list_del(&t->list); + recent_table_flush(t); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); #endif - - curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); - if(!curr_table->hash_table) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - - for(c = 0; c < ip_list_hash_size; c++) { - curr_table->hash_table[c] = -1; + kfree(t); } + spin_unlock_bh(&recent_lock); +} - /* Allocate memory for the time info */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", - sizeof(struct time_info_list)*ip_list_tot); -#endif +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; - curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); - if(!curr_table->time_info) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->time_info[c].position = c; - curr_table->time_info[c].time = 0; - } +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; - /* Put the new table in place */ spin_lock_bh(&recent_lock); - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { - find_table->count++; - spin_unlock_bh(&recent_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); -#endif - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 1; - } - if(!last_table) r_tables = curr_table; else last_table->next = curr_table; - spin_unlock_bh(&recent_lock); - -#ifdef CONFIG_PROC_FS - /* Create our proc 'status' entry. */ - curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); - if (!curr_table->status_proc) { - vfree(hold); - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); - /* Destroy the created table */ - spin_lock_bh(&recent_lock); - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; - } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { + list_for_each_entry(e, &t->iphash[st->bucket], list) { + if (p-- == 0) + return e; } - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(curr_table->table); - vfree(curr_table); - return 0; } - - curr_table->status_proc->owner = THIS_MODULE; - curr_table->status_proc->data = curr_table; - wmb(); - curr_table->status_proc->read_proc = ip_recent_get_info; - curr_table->status_proc->write_proc = ip_recent_ctrl; -#endif /* CONFIG_PROC_FS */ - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); -#endif + return NULL; +} - return 1; +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); } -/* This function is called in the event that a rule matching this module is - * removed. - * When this happens we need to check if there are no other rules matching - * the table given. If that is the case then we remove the table and clean - * up its memory. - */ -static void -destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) +static void recent_seq_stop(struct seq_file *s, void *v) { - const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *last_table; + spin_unlock_bh(&recent_lock); +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); -#endif +static int recent_seq_show(struct seq_file *seq, void *v) +{ + struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} - if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; +static struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; - /* Lock the linked list while we play with it */ - spin_lock_bh(&recent_lock); +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct seq_file *seq; + struct recent_iter_state *st; + int ret; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &recent_seq_ops); + if (ret) + kfree(st); + st->table = pde->data; + seq = file->private_data; + seq->private = st; + return ret; +} - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); -#endif +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + u_int32_t addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); spin_unlock_bh(&recent_lock); - return; + return c - buf; } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - /* If a table does not exist then do nothing and return */ - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; } + addr = in_aton(c); - curr_table->count--; - - /* If count is still non-zero then there are still rules referenceing it so we do nothing */ - if(curr_table->count) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); -#endif - - /* Count must be zero so we remove this table from the list */ - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); + return size; +} - /* lock to make sure any late-runners still using this after we removed it from - * the list finish up then remove everything */ - spin_lock_bh(&curr_table->list_lock); - spin_unlock_bh(&curr_table->list_lock); - -#ifdef CONFIG_PROC_FS - if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +static struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ - vfree(curr_table->table[0].last_pkts); - vfree(curr_table->table); - vfree(curr_table->hash_table); - vfree(curr_table->time_info); - vfree(curr_table); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); -#endif - return; -} - -/* This is the structure we pass to ipt_register to register our - * module with iptables. - */ static struct ipt_match recent_match = { .name = "recent", - .match = match, + .match = ipt_recent_match, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = checkentry, - .destroy = destroy, - .me = THIS_MODULE + .checkentry = ipt_recent_checkentry, + .destroy = ipt_recent_destroy, + .me = THIS_MODULE, }; -/* Kernel module initialization. */ static int __init ipt_recent_init(void) { - int err, count; + int err; - printk(version); -#ifdef CONFIG_PROC_FS - proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); - if(!proc_net_ipt_recent) return -ENOMEM; -#endif - - if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { - printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); - ip_list_hash_size = 0; - } - - if(!ip_list_hash_size) { - ip_list_hash_size = ip_list_tot*3; - count = 2*2; - while(ip_list_hash_size > count) count = count*2; - ip_list_hash_size = count; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); -#endif + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); err = ipt_register_match(&recent_match); +#ifdef CONFIG_PROC_FS if (err) - remove_proc_entry("ipt_recent", proc_net); + return err; + proc_dir = proc_mkdir("ipt_recent", proc_net); + if (proc_dir == NULL) { + ipt_unregister_match(&recent_match); + err = -ENOMEM; + } +#endif return err; } -/* Kernel module destruction. */ -static void __exit ipt_recent_fini(void) +static void __exit ipt_recent_exit(void) { + BUG_ON(!list_empty(&tables)); ipt_unregister_match(&recent_match); - - remove_proc_entry("ipt_recent",proc_net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", proc_net); +#endif } -/* Register our module with the kernel. */ module_init(ipt_recent_init); -module_exit(ipt_recent_fini); +module_exit(ipt_recent_exit); -- cgit v1.2.3 From 6442f1cf897643d4ca597f2f7d3464b765bae960 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:53 -0700 Subject: [NETFILTER]: conntrack: don't call helpers for related ICMP messages None of the existing helpers expects to get called for related ICMP packets and some even drop them if they can't parse them. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_standalone.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f7be91..f0cc7feb0da3 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -417,7 +417,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper) { + if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { unsigned int ret; ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77d974443c7b..8cc8e1b36778 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) return NF_ACCEPT; help = nfct_help(ct); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 93bae36f2663..2a71c3b669f1 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) goto out; help = nfct_help(ct); -- cgit v1.2.3 From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:23:54 -0700 Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 1 + include/linux/sysctl.h | 2 ++ include/net/netfilter/nf_conntrack.h | 1 + net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 2 +- net/ipv4/netfilter/ip_conntrack_standalone.c | 11 +++++++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 5 +++-- net/netfilter/nf_conntrack_proto_udp.c | 3 ++- net/netfilter/nf_conntrack_standalone.c | 11 +++++++++++ 12 files changed, 36 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index d54d7b278e96..5473c01f69ea 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -293,6 +293,7 @@ static inline int is_dying(struct ip_conntrack *ct) } extern unsigned int ip_conntrack_htable_size; +extern int ip_conntrack_checksum; #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cd9e7c0825ad..98338ed2c0b6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -313,6 +313,7 @@ enum NET_NF_CONNTRACK_FRAG6_TIMEOUT=29, NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31, + NET_NF_CONNTRACK_CHECKSUM=32, }; /* /proc/sys/net/ipv4 */ @@ -492,6 +493,7 @@ enum NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, NET_IPV4_NF_CONNTRACK_COUNT=27, + NET_IPV4_NF_CONNTRACK_CHECKSUM=28, }; /* /proc/sys/net/ipv6 */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 916013ca4a5c..dbe7a114d0c5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -285,6 +285,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct) } extern unsigned int nf_conntrack_htable_size; +extern int nf_conntrack_checksum; #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9010a6..23f1c504586d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252b58ad..c5c2ce5cdeb8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 70899868783b..9b2c16b4d2ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f0cc7feb0da3..6cb9b989d14c 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int ip_conntrack_checksum = 1; + static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { @@ -591,6 +593,14 @@ static ctl_table ip_ct_sysctl_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, + .procname = "ip_conntrack_checksum", + .data = &ip_conntrack_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "ip_conntrack_tcp_timeout_syn_sent", @@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +EXPORT_SYMBOL_GPL(ip_conntrack_checksum); #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361cc6e6..663a73ee3f2f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 86c6703265d0..ef18a7b7014b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } - if (hooknum == NF_IP6_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed\n"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 69899f27d26a..12fb7c0a1509 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index d93edbfde9e3..ae07ebe3ab37 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 408960c6a544..e01d20d8e287 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int nf_conntrack_checksum = 1; + static struct ctl_table_header *nf_ct_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { @@ -482,6 +484,14 @@ static ctl_table nf_ct_sysctl_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_NF_CONNTRACK_CHECKSUM, + .procname = "nf_conntrack_checksum", + .data = &nf_conntrack_checksum, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "nf_conntrack_tcp_timeout_syn_sent", @@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put); EXPORT_SYMBOL(nf_ct_l3proto_find_get); EXPORT_SYMBOL(nf_ct_l3proto_put); EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL_GPL(nf_conntrack_checksum); EXPORT_SYMBOL(nf_conntrack_expect_alloc); EXPORT_SYMBOL(nf_conntrack_expect_put); EXPORT_SYMBOL(nf_conntrack_expect_related); -- cgit v1.2.3 From 997ae831ade74bdaed4172b1c02060b9efd6e206 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 29 May 2006 18:24:20 -0700 Subject: [NETFILTER]: conntrack: add fixed timeout flag in connection tracking Add a flag in a connection status to have a non updated timeout. This permits to have connection that automatically die at a given time. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nf_conntrack_common.h | 4 ++++ net/ipv4/netfilter/ip_conntrack_core.c | 6 ++++++ net/netfilter/nf_conntrack_core.c | 6 ++++++ 3 files changed, 16 insertions(+) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 3ff88c878308..d2e4bd7a7a14 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -69,6 +69,10 @@ enum ip_conntrack_status { /* Connection is dying (removed from lists), can not be unset. */ IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), + + /* Connection has fixed timeout. */ + IPS_FIXED_TIMEOUT_BIT = 10, + IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), }; /* Connection tracking event bits */ diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a297da7bbef5..4fe9e69378df 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1130,6 +1130,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, write_lock_bh(&ip_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&ip_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9b83f91371a..bc2bd4c3859e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1396,6 +1396,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, write_lock_bh(&nf_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&nf_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; -- cgit v1.2.3 From 3726add76643c715d437aceda320d319153b6113 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:39 -0700 Subject: [NETFILTER]: ctnetlink: fix NAT configuration The current configuration only allows to configure one manip and overloads conntrack status flags with netlink semantic. Signed-off-by: Patrick Mchardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 4 +- net/ipv4/netfilter/ip_conntrack_netlink.c | 53 +++++++++++---------------- net/netfilter/nf_conntrack_netlink.c | 53 +++++++++++---------------- 3 files changed, 47 insertions(+), 63 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 668ec946c8e2..b5883ccee295 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -27,13 +27,15 @@ enum ctattr_type { CTA_STATUS, CTA_PROTOINFO, CTA_HELP, - CTA_NAT, + CTA_NAT_SRC, +#define CTA_NAT CTA_NAT_SRC /* backwards compatibility */ CTA_TIMEOUT, CTA_MARK, CTA_COUNTERS_ORIG, CTA_COUNTERS_REPLY, CTA_USE, CTA_ID, + CTA_NAT_DST, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7cab9367..af152e3623dc 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -629,7 +629,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct ip_conntrack *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -639,7 +639,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -854,39 +854,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + } #endif } @@ -1106,7 +1097,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bd10eb944b65..8f27fe9446f2 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -641,7 +641,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct nf_conn *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -651,7 +651,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -866,39 +866,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } #endif } @@ -1122,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } -- cgit v1.2.3 From 89f2e21883b59a6ff1e64d0b4924d06b1c6101ba Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:58 -0700 Subject: [NETFILTER]: ctnetlink: change table dumping not to require an unique ID Instead of using the ID to find out where to continue dumping, take a reference to the last entry dumped and try to continue there. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- net/netfilter/nf_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index af152e3623dc..33891bb1fde4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -399,38 +399,54 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { DEBUGP("entered %s\n", __FUNCTION__); + if (cb->args[1]) + ip_conntrack_put((struct ip_conntrack *)cb->args[1]); return 0; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct ip_conntrack *ct = NULL; + struct ip_conntrack *ct, *last; struct ip_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, cb->args[0], *id); read_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct ip_conntrack *)cb->args[1]; list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + ip_conntrack_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + ip_conntrack_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&ip_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8f27fe9446f2..b8c7c567c9df 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -407,6 +407,8 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); DEBUGP("entered %s\n", __FUNCTION__); return 0; } @@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb) static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conn *ct = NULL; + struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; @@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0], *id); read_lock_bh(&nf_conntrack_lock); - for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct nf_conn *)cb->args[1]; list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { h = (struct nf_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) * then dump everything. */ if (l3proto && L3PROTO(ct) != l3proto) continue; - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + nf_ct_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + nf_ct_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&nf_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); -- cgit v1.2.3 From 695ecea3299dba2239d1cb4fd4d4e4c95a5b9ce7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:14 -0700 Subject: [NETFILTER]: SNMP helper: fix debug module param type debug is the debug level, not a bool. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c33244263b90..d20d557f915a 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) module_init(ip_nat_snmp_basic_init); module_exit(ip_nat_snmp_basic_fini); -module_param(debug, bool, 0600); +module_param(debug, int, 0600); -- cgit v1.2.3 From 7d8c50181778b6ba10c2bba9a2f22db9493bb245 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:38 -0700 Subject: [NETFILTER]: FTP helper: search optimization Instead of skipping search entries for the wrong direction simply index them by direction. Based on patch by Pablo Neira Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 77 +++++++++++++++++++---------------- net/netfilter/nf_conntrack_ftp.c | 77 +++++++++++++++++++---------------- 2 files changed, 86 insertions(+), 68 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf28a9d..4dcf526c3944 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); static const struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, array, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (ip_nat_ftp_hook) - ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e38a4b5a3089..11d3be243536 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, char); static struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb, memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all)); - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, datalen, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, &cmd, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (nf_nat_ftp_hook) - ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ -- cgit v1.2.3 From c95261693467f0aeac7fafa69860ddfb02bc12f8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:58 -0700 Subject: [NETFILTER]: amanda helper: convert to textsearch infrastructure When a port number within a packet is replaced by a differently sized number only the packet is resized, but not the copy of the data. Following port numbers are rewritten based on their offsets within the copy, leading to packet corruption. Convert the amanda helper to the textsearch infrastructure to avoid the copy entirely. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 143 ++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 49 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d4072533da21..1540e227890f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -142,6 +142,8 @@ config IP_NF_TFTP config IP_NF_AMANDA tristate "Amanda backup protocol support" depends on IP_NF_CONNTRACK + select TEXTSEARCH + select TEXTSEARCH_KMP help If you are running the Amanda backup package on this machine or machines that will be MASQUERADED through this diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1ccfdaa..0a7bd7f04061 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -17,33 +17,29 @@ * this value. * */ - -#include #include #include -#include -#include #include +#include +#include +#include +#include #include -#include -#include +#include #include #include static unsigned int master_timeout = 300; +static char *ts_algo = "kmp"; MODULE_AUTHOR("Brian J. Murrell "); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); - -static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; - -/* This is slow, but it's simple. --RR */ -static char *amanda_buffer; -static DEFINE_SPINLOCK(amanda_buffer_lock); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, struct ip_conntrack_expect *exp); EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + char *string; + size_t len; + struct ts_config *ts; +} search[] = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + static int help(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { + struct ts_state ts; struct ip_conntrack_expect *exp; - char *data, *data_limit, *tmp; - unsigned int dataoff, i; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; u_int16_t port, len; int ret = NF_ACCEPT; @@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, return NF_ACCEPT; } - spin_lock_bh(&amanda_buffer_lock); - skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); - data = amanda_buffer; - data_limit = amanda_buffer + (*pskb)->len - dataoff; - *data_limit = '\0'; - - /* Search for the CONNECT string */ - data = strstr(data, "CONNECT "); - if (!data) + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(*pskb, dataoff, (*pskb)->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) goto out; - data += strlen("CONNECT "); + start += dataoff + search[SEARCH_CONNECT].len; - /* Only search first line. */ - if ((tmp = strchr(data, '\n'))) - *tmp = '\0'; + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(*pskb, start, (*pskb)->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; - for (i = 0; i < ARRAY_SIZE(conns); i++) { - char *match = strstr(data, conns[i]); - if (!match) + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) continue; - tmp = data = match + strlen(conns[i]); - port = simple_strtoul(data, &data, 10); - len = data - tmp; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(*pskb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = simple_strtoul(pbuf, &tmp, 10); + len = tmp - pbuf; if (port == 0 || len > 5) break; @@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, exp->mask.dst.u.tcp.port = 0xFFFF; if (ip_nat_amanda_hook) - ret = ip_nat_amanda_hook(pskb, ctinfo, - tmp - amanda_buffer, + ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, len, exp); else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; @@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, } out: - spin_unlock_bh(&amanda_buffer_lock); return ret; } static struct ip_conntrack_helper amanda_helper = { - .max_expected = ARRAY_SIZE(conns), + .max_expected = 3, .timeout = 180, .me = THIS_MODULE, .help = help, @@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit ip_conntrack_amanda_fini(void) { + int i; + ip_conntrack_helper_unregister(&amanda_helper); - kfree(amanda_buffer); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); } static int __init ip_conntrack_amanda_init(void) { - int ret; - - amanda_buffer = kmalloc(65536, GFP_KERNEL); - if (!amanda_buffer) - return -ENOMEM; - - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) { - kfree(amanda_buffer); - return ret; + int ret, i; + + ret = -ENOMEM; + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (search[i].ts == NULL) + goto err; } + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) + goto err; return 0; - +err: + for (; i >= 0; i--) { + if (search[i].ts) + textsearch_destroy(search[i].ts); + } + return ret; } module_init(ip_conntrack_amanda_init); -- cgit v1.2.3 From c0d4cfd96dd0cc0dbf49435898808b5553af4822 Mon Sep 17 00:00:00 2001 From: Jing Min Zhao Date: Mon, 29 May 2006 18:26:27 -0700 Subject: [NETFILTER]: H.323 helper: Add support for Call Forwarding Signed-off-by: Jing Min Zhao Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 1 + include/linux/netfilter_ipv4/ip_conntrack_h323.h | 7 ++ .../ip_conntrack_helper_h323_types.h | 3 +- net/ipv4/netfilter/Kconfig | 8 +- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 112 +++++++++++++++++++++ .../netfilter/ip_conntrack_helper_h323_types.c | 6 +- net/ipv4/netfilter/ip_nat_helper_h323.c | 77 ++++++++++++++ 7 files changed, 206 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 5473c01f69ea..17d7ef938a09 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -154,6 +154,7 @@ struct ip_conntrack_expect unsigned int flags; #ifdef CONFIG_IP_NF_NAT_NEEDED + u_int32_t saved_ip; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ union ip_conntrack_manip_proto saved_proto; diff --git a/include/linux/netfilter_ipv4/ip_conntrack_h323.h b/include/linux/netfilter_ipv4/ip_conntrack_h323.h index eace86bd2adb..3cbff7379002 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_h323.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_h323.h @@ -71,6 +71,13 @@ extern int (*nat_h245_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +extern int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, + u_int16_t port, + struct ip_conntrack_expect * exp); extern int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, unsigned char **data, TransportAddress * addr, diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h index cc98f7aa5abe..3d4a773799fc 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao * @@ -412,6 +412,7 @@ typedef struct Facility_UUIE { /* SEQUENCE */ eFacility_UUIE_destinationInfo = (1 << 14), eFacility_UUIE_h245SecurityMode = (1 << 13), } options; + TransportAddress alternativeAddress; FacilityReason reason; TransportAddress h245Address; Facility_UUIE_fastStart fastStart; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1540e227890f..f86aeda3a5a0 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -183,10 +183,10 @@ config IP_NF_H323 With this module you can support H.323 on a connection tracking/NAT firewall. - This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP - and T.120 based data and applications including audio, video, FAX, - chat, whiteboard, file transfer, etc. For more information, please - see http://nath323.sourceforge.net/. + This module supports RAS, Fast Start, H.245 Tunnelling, Call + Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, + whiteboard, file transfer, etc. For more information, please + visit http://nath323.sourceforge.net/. If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581d39ec..3052468a6ab1 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #if 0 #define DEBUGP printk @@ -38,6 +40,13 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); +static char *internal_net = NULL; +static u_int32_t internal_net_addr = 0; +static u_int32_t internal_net_mask = 0; +module_param(internal_net, charp, 0600); +MODULE_PARM_DESC(internal_net, "specify your internal network using format " + "address/mask. this is used by call forwarding support"); + /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, @@ -77,6 +86,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect * exp); int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, @@ -683,6 +698,76 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, return ret; } +/* Forwarding declaration */ +void ip_conntrack_q931_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this); + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + u_int32_t ip; + u_int16_t port; + struct ip_conntrack_expect *exp = NULL; + + /* Read alternativeAddress */ + if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (internal_net && + ((ip & internal_net_mask) == internal_net_addr) == + ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == + internal_net_addr)) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + + /* Create expect for the second call leg */ + if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) + return -1; + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ip; + exp->tuple.dst.u.tcp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + exp->flags = 0; + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { + /* Need NAT */ + ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, + addr, port, exp); + } else { /* Conntrack only */ + exp->expectfn = ip_conntrack_q931_expect; + + if (ip_conntrack_expect_related(exp) == 0) { + DEBUGP("ip_ct_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), + ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), + ntohs(exp->tuple.dst.u.tcp.port)); + } else + ret = -1; + } + + ip_conntrack_expect_put(exp); + + return ret; +} + /****************************************************************************/ static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, @@ -878,6 +963,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, DEBUGP("ip_ct_q931: Facility\n"); + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(pskb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + if (facility->options & eFacility_UUIE_h245Address) { ret = expect_h245(pskb, ct, ctinfo, data, dataoff, &facility->h245Address); @@ -1668,6 +1762,7 @@ static void fini(void) static int __init init(void) { int ret; + char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1678,6 +1773,22 @@ static int __init init(void) return ret; } + if (internal_net) { + if ((p = strchr(internal_net, '/'))) + *p++ = 0; + if (isdigit(internal_net[0])) { + internal_net_addr = in_aton(internal_net); + if (p && isdigit(p[0])) + internal_net_mask = in_aton(p); + else + internal_net_mask = 0xffffffff; + internal_net_addr &= internal_net_mask; + } + DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", + NIPQUAD(internal_net_addr), + NIPQUAD(internal_net_mask)); + } + DEBUGP("ip_ct_h323: init success\n"); return 0; } @@ -1696,6 +1807,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); EXPORT_SYMBOL_GPL(nat_t120_hook); EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); EXPORT_SYMBOL_GPL(nat_q931_hook); MODULE_AUTHOR("Jing Min Zhao "); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b9f6c9..4b359618bedd 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao * @@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ static field_t _Facility_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, - {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, - _TransportAddress}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _Facility_UUIE_alternativeAliasAddress}, {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d137a7..419b878fb467 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c @@ -486,6 +486,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, return 0; } +/****************************************************************************/ +static void ip_nat_callforwarding_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; + + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = this->saved_ip; + + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + + ip_conntrack_q931_expect(new, this); +} + +/****************************************************************************/ +static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port; + + /* Set expectations for NAT */ + exp->saved_ip = exp->tuple.dst.ip; + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_callforwarding_expect; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. */ + for (nated_port = port; nated_port != 0; nated_port++) { + exp->tuple.dst.u.tcp.port = htons(nated_port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + printk("ip_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (!set_h225_addr(pskb, data, dataoff, addr, + ct->tuplehash[!dir].tuple.dst.ip, + nated_port) == 0) { + ip_conntrack_unexpect_related(exp); + return -1; + } + + /* Success */ + DEBUGP("ip_nat_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + /****************************************************************************/ static int __init init(void) { @@ -496,6 +570,7 @@ static int __init init(void) BUG_ON(nat_rtp_rtcp_hook != NULL); BUG_ON(nat_t120_hook != NULL); BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); BUG_ON(nat_q931_hook != NULL); set_h245_addr_hook = set_h245_addr; @@ -505,6 +580,7 @@ static int __init init(void) nat_rtp_rtcp_hook = nat_rtp_rtcp; nat_t120_hook = nat_t120; nat_h245_hook = nat_h245; + nat_callforwarding_hook = nat_callforwarding; nat_q931_hook = nat_q931; DEBUGP("ip_nat_h323: init success\n"); @@ -521,6 +597,7 @@ static void __exit fini(void) nat_rtp_rtcp_hook = NULL; nat_t120_hook = NULL; nat_h245_hook = NULL; + nat_callforwarding_hook = NULL; nat_q931_hook = NULL; synchronize_net(); } -- cgit v1.2.3 From e44ab66a75e20c02193440a5e27c16c91630109b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:26:47 -0700 Subject: [NETFILTER]: H.323 helper: replace internal_net_addr parameter by routing-based heuristic Call Forwarding doesn't need to create an expectation if both peers can reach each other without our help. The internal_net_addr parameter lets the user explicitly specify a single network where this is true, but is not very flexible and even fails in the common case that calls will both be forwarded to outside parties and inside parties. Use an optional heuristic based on routing instead, the assumption is that if bpth the outgoing device and the gateway are equal, both peers can reach each other directly. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 57 +++++++++++++-------------- 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 3052468a6ab1..0665674218c6 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -40,12 +40,11 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); -static char *internal_net = NULL; -static u_int32_t internal_net_addr = 0; -static u_int32_t internal_net_mask = 0; -module_param(internal_net, charp, 0600); -MODULE_PARM_DESC(internal_net, "specify your internal network using format " - "address/mask. this is used by call forwarding support"); +static int callforward_filter = 1; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, @@ -721,12 +720,28 @@ static int expect_callforwarding(struct sk_buff **pskb, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ - if (internal_net && - ((ip & internal_net_mask) == internal_net_addr) == - ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == - internal_net_addr)) { - DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); - return 0; + if (callforward_filter) { + struct rtable *rt1, *rt2; + struct flowi fl1 = { + .fl4_dst = ip, + }; + struct flowi fl2 = { + .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, + }; + + if (ip_route_output_key(&rt1, &fl1) == 0) { + if (ip_route_output_key(&rt2, &fl2) == 0) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->u.dst.dev == rt2->u.dst.dev) + ret = 1; + dst_release(&rt2->u.dst); + } + dst_release(&rt1->u.dst); + } + if (ret) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } } /* Create expect for the second call leg */ @@ -1762,7 +1777,6 @@ static void fini(void) static int __init init(void) { int ret; - char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1772,23 +1786,6 @@ static int __init init(void) fini(); return ret; } - - if (internal_net) { - if ((p = strchr(internal_net, '/'))) - *p++ = 0; - if (isdigit(internal_net[0])) { - internal_net_addr = in_aton(internal_net); - if (p && isdigit(p[0])) - internal_net_mask = in_aton(p); - else - internal_net_mask = 0xffffffff; - internal_net_addr &= internal_net_mask; - } - DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", - NIPQUAD(internal_net_addr), - NIPQUAD(internal_net_mask)); - } - DEBUGP("ip_ct_h323: init success\n"); return 0; } -- cgit v1.2.3 From ae5b7d8ba2c28d7d9835856fe0ca5f6ec95ea768 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:27:09 -0700 Subject: [NETFILTER]: Add SIP connection tracking helper Add SIP connection tracking helper. Originally written by Christian Hentschel , some cleanup, minor fixes and bidirectional SIP support added by myself. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack_sip.h | 44 +++ net/ipv4/netfilter/Kconfig | 18 + net/ipv4/netfilter/Makefile | 2 + net/ipv4/netfilter/ip_conntrack_sip.c | 471 ++++++++++++++++++++++++ net/ipv4/netfilter/ip_nat_sip.c | 249 +++++++++++++ 5 files changed, 784 insertions(+) create mode 100644 include/linux/netfilter_ipv4/ip_conntrack_sip.h create mode 100644 net/ipv4/netfilter/ip_conntrack_sip.c create mode 100644 net/ipv4/netfilter/ip_nat_sip.c (limited to 'net/ipv4') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sip.h b/include/linux/netfilter_ipv4/ip_conntrack_sip.h new file mode 100644 index 000000000000..913dad66c0fb --- /dev/null +++ b/include/linux/netfilter_ipv4/ip_conntrack_sip.h @@ -0,0 +1,44 @@ +#ifndef __IP_CONNTRACK_SIP_H__ +#define __IP_CONNTRACK_SIP_H__ +#ifdef __KERNEL__ + +#define SIP_PORT 5060 +#define SIP_TIMEOUT 3600 + +#define POS_VIA 0 +#define POS_CONTACT 1 +#define POS_CONTENT 2 +#define POS_MEDIA 3 +#define POS_OWNER 4 +#define POS_CONNECTION 5 +#define POS_REQ_HEADER 6 +#define POS_SDP_HEADER 7 + +struct sip_header_nfo { + const char *lname; + const char *sname; + const char *ln_str; + size_t lnlen; + size_t snlen; + size_t ln_strlen; + int (*match_len)(const char *, const char *, int *); +}; + +extern unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +extern unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); + +extern int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +extern int ct_sip_lnlen(const char *line, const char *limit); +extern const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len); +#endif /* __KERNEL__ */ +#endif /* __IP_CONNTRACK_SIP_H__ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f86aeda3a5a0..ff4b118f14a9 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -191,6 +191,18 @@ config IP_NF_H323 If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. +config IP_NF_SIP + tristate "SIP protocol support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the ip_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say Y. + config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -503,6 +515,12 @@ config IP_NF_NAT_H323 default IP_NF_NAT if IP_NF_H323=y default m if IP_NF_H323=m +config IP_NF_NAT_SIP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_SIP=y + default m if IP_NF_SIP=m + # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1eb5de7..3ded4a3af59c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o # NAT helpers @@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 000000000000..fc87ce0da40d --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c @@ -0,0 +1,471 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_conntrack_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP connection tracking helper"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of sip servers"); + +static unsigned int sip_timeout = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +EXPORT_SYMBOL_GPL(ip_nat_sip_hook); + +unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); +EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); + +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +EXPORT_SYMBOL_GPL(ct_sip_get_info); + + +static int digits_len(const char *dptr, const char *limit, int *shift); +static int epaddr_len(const char *dptr, const char *limit, int *shift); +static int skp_digits_len(const char *dptr, const char *limit, int *shift); +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); + +struct sip_header_nfo ct_sip_hdrs[] = { + { /* Via header */ + .lname = "Via:", + .lnlen = sizeof("Via:") - 1, + .sname = "\r\nv:", + .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ + .ln_str = "UDP ", + .ln_strlen = sizeof("UDP ") - 1, + .match_len = epaddr_len, + }, + { /* Contact header */ + .lname = "Contact:", + .lnlen = sizeof("Contact:") - 1, + .sname = "\r\nm:", + .snlen = sizeof("\r\nm:") - 1, + .ln_str = "sip:", + .ln_strlen = sizeof("sip:") - 1, + .match_len = skp_epaddr_len + }, + { /* Content length header */ + .lname = "Content-Length:", + .lnlen = sizeof("Content-Length:") - 1, + .sname = "\r\nl:", + .snlen = sizeof("\r\nl:") - 1, + .ln_str = ":", + .ln_strlen = sizeof(":") - 1, + .match_len = skp_digits_len + }, + { /* SDP media info */ + .lname = "\nm=", + .lnlen = sizeof("\nm=") - 1, + .sname = "\rm=", + .snlen = sizeof("\rm=") - 1, + .ln_str = "audio ", + .ln_strlen = sizeof("audio ") - 1, + .match_len = digits_len + }, + { /* SDP owner address*/ + .lname = "\no=", + .lnlen = sizeof("\no=") - 1, + .sname = "\ro=", + .snlen = sizeof("\ro=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* SDP connection info */ + .lname = "\nc=", + .lnlen = sizeof("\nc=") - 1, + .sname = "\rc=", + .snlen = sizeof("\rc=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* Requests headers */ + .lname = "sip:", + .lnlen = sizeof("sip:") - 1, + .sname = "sip:", + .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ + .ln_str = "@", + .ln_strlen = sizeof("@") - 1, + .match_len = epaddr_len + }, + { /* SDP version header */ + .lname = "\nv=", + .lnlen = sizeof("\nv=") - 1, + .sname = "\rv=", + .snlen = sizeof("\rv=") - 1, + .ln_str = "=", + .ln_strlen = sizeof("=") - 1, + .match_len = digits_len + } +}; +EXPORT_SYMBOL_GPL(ct_sip_hdrs); + +/* get line lenght until first CR or LF seen. */ +int ct_sip_lnlen(const char *line, const char *limit) +{ + const char *k = line; + + while ((line <= limit) && (*line == '\r' || *line == '\n')) + line++; + + while (line <= limit) { + if (*line == '\r' || *line == '\n') + break; + line++; + } + return line - k; +} +EXPORT_SYMBOL_GPL(ct_sip_lnlen); + +/* Linear string search, case sensitive. */ +const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len) +{ + const char *limit = haystack + (haystack_len - needle_len); + + while (haystack <= limit) { + if (memcmp(haystack, needle, needle_len) == 0) + return haystack; + haystack++; + } + return NULL; +} +EXPORT_SYMBOL_GPL(ct_sip_search); + +static int digits_len(const char *dptr, const char *limit, int *shift) +{ + int len = 0; + while (dptr <= limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +/* get digits lenght, skiping blank spaces. */ +static int skp_digits_len(const char *dptr, const char *limit, int *shift) +{ + for (; dptr <= limit && *dptr == ' '; dptr++) + (*shift)++; + + return digits_len(dptr, limit, shift); +} + +/* Simple ipaddr parser.. */ +static int parse_ipaddr(const char *cp, const char **endp, + u_int32_t *ipaddr, const char *limit) +{ + unsigned long int val; + int i, digit = 0; + + for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { + digit = 0; + if (!isdigit(*cp)) + break; + + val = simple_strtoul(cp, (char **)&cp, 10); + if (val > 0xFF) + return -1; + + ((u_int8_t *)ipaddr)[i] = val; + digit = 1; + + if (*cp != '.') + break; + cp++; + } + if (!digit) + return -1; + + if (endp) + *endp = cp; + + return 0; +} + +/* skip ip address. returns it lenght. */ +static int epaddr_len(const char *dptr, const char *limit, int *shift) +{ + const char *aux = dptr; + u_int32_t ip; + + if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { + DEBUGP("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) +{ + int s = *shift; + + for (; dptr <= limit && *dptr != '@'; dptr++) + (*shift)++; + + if (*dptr == '@') { + dptr++; + (*shift)++; + } else + *shift = s; + + return epaddr_len(dptr, limit, shift); +} + +/* Returns 0 if not found, -1 error parsing. */ +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo) +{ + const char *limit, *aux, *k = dptr; + int shift = 0; + + limit = dptr + (dlen - hnfo->lnlen); + + while (dptr <= limit) { + if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && + (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { + dptr++; + continue; + } + aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, + ct_sip_lnlen(dptr, limit)); + if (!aux) { + DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, + hnfo->lname); + return -1; + } + aux += hnfo->ln_strlen; + + *matchlen = hnfo->match_len(aux, limit, &shift); + if (!*matchlen) + return -1; + + *matchoff = (aux - k) + shift; + + DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, + *matchlen); + return 1; + } + DEBUGP("%s header not found.\n", hnfo->lname); + return 0; +} + +static int set_expected_rtp(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u_int32_t ipaddr, u_int16_t port, + const char *dptr) +{ + struct ip_conntrack_expect *exp; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + int ret; + + exp = ip_conntrack_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.udp.port = 0; + exp->tuple.dst.ip = ipaddr; + exp->tuple.dst.u.udp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_UDP; + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.udp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.udp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + + exp->expectfn = NULL; + exp->flags = 0; + + if (ip_nat_sdp_hook) + ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); + else { + if (ip_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + ip_conntrack_expect_put(exp); + + return ret; +} + +static int sip_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + int ret = NF_ACCEPT; + int matchoff, matchlen; + u_int32_t ipaddr; + u_int16_t port; + + /* No Data ? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + DEBUGP("skb->len = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + ip_ct_refresh(ct, *pskb, sip_timeout * HZ); + + if (!skb_is_nonlinear(*pskb)) + dptr = (*pskb)->data + dataoff; + else { + DEBUGP("Copy of skbuff not supported yet.\n"); + goto out; + } + + if (ip_nat_sip_hook) { + if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { + ret = NF_DROP; + goto out; + } + } + + /* After this point NAT, could have mangled skb, so + we need to recalculate payload lenght. */ + datalen = (*pskb)->len - dataoff; + + if (datalen < (sizeof("SIP/2.0 200") - 1)) + goto out; + + /* RTP info only in some SDP pkts */ + if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && + memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { + goto out; + } + /* Get ip and port address from SDP packet. */ + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_CONNECTION]) > 0) { + + /* We'll drop only if there are parse problems. */ + if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, + dptr + datalen) < 0) { + ret = NF_DROP; + goto out; + } + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_MEDIA]) > 0) { + + port = simple_strtoul(dptr + matchoff, NULL, 10); + if (port < 1024) { + ret = NF_DROP; + goto out; + } + ret = set_expected_rtp(pskb, ct, ctinfo, + ipaddr, port, dptr); + } + } +out: + return ret; +} + +static struct ip_conntrack_helper sip[MAX_PORTS]; +static char sip_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", ports[i]); + ip_conntrack_helper_unregister(&sip[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); + + sip[i].tuple.dst.protonum = IPPROTO_UDP; + sip[i].tuple.src.u.udp.port = htons(ports[i]); + sip[i].mask.src.u.udp.port = 0xFFFF; + sip[i].mask.dst.protonum = 0xFF; + sip[i].max_expected = 1; + sip[i].timeout = 3 * 60; /* 3 minutes */ + sip[i].me = THIS_MODULE; + sip[i].help = sip_help; + + tmpname = &sip_names[i][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%d", i); + sip[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(&sip[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 000000000000..6ffba63adca2 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c @@ -0,0 +1,249 @@ +/* SIP extension for UDP NAT alteration. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_nat_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +extern struct sip_header_nfo ct_sip_hdrs[]; + +static unsigned int mangle_sip_packet(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr, size_t dlen, + char *buffer, int bufflen, + struct sip_header_nfo *hnfo) +{ + unsigned int matchlen, matchoff; + + if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) + return 0; + + if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, buffer, bufflen)) + return 0; + + /* We need to reload this. Thanks Patrick. */ + *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + return 1; +} + +static unsigned int ip_nat_sip(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int bufflen, dataoff; + u_int32_t ip; + u_int16_t port; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + ip = ct->tuplehash[!dir].tuple.dst.ip; + port = ct->tuplehash[!dir].tuple.dst.u.udp.port; + bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); + + /* short packet ? */ + if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) + return 0; + + /* Basic rules: requests and responses. */ + if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { + const char *aux; + + if ((ctinfo) < IP_CT_IS_REPLY) { + mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* This search should ignore case, but later.. */ + aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, + (*pskb)->len - dataoff); + if (!aux) + return 0; + + if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), + ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) + return 1; + + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + } + if ((ctinfo) < IP_CT_IS_REPLY) { + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ + mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + /* This mangle requests headers. */ + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + ct_sip_lnlen(*dptr, + *dptr + (*pskb)->len - dataoff), + buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); +} + +static int mangle_content_len(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char *dptr) +{ + unsigned int dataoff, matchoff, matchlen; + char buffer[sizeof("65536")]; + int bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Get actual SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { + + /* since ct_sip_get_info() give us a pointer passing 'v=' + we need to add 2 bytes in this count. */ + int c_len = (*pskb)->len - dataoff - matchoff + 2; + + /* Now, update SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { + + bufflen = sprintf(buffer, "%u", c_len); + + return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, + buffer, bufflen); + } + } + return 0; +} + +static unsigned int mangle_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + u_int32_t newip, u_int16_t port, + const char *dptr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int dataoff, bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Mangle owner and contact info. */ + bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) + return 0; + + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) + return 0; + + /* Mangle media port. */ + bufflen = sprintf(buffer, "%u", port); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) + return 0; + + return mangle_content_len(pskb, ctinfo, ct, dptr); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr) +{ + struct ip_conntrack *ct = exp->master; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int32_t newip; + u_int16_t port; + + DEBUGP("ip_nat_sdp():\n"); + + /* Connection will come from reply */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + + exp->tuple.dst.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { + exp->tuple.dst.u.udp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) + return NF_DROP; + + if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_sip_hook = NULL; + ip_nat_sdp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_sip_hook); + BUG_ON(ip_nat_sdp_hook); + ip_nat_sip_hook = ip_nat_sip; + ip_nat_sdp_hook = ip_nat_sdp; + return 0; +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3 From c45fb1089e714146206d7e295ff337893424c874 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 29 May 2006 18:27:32 -0700 Subject: [NETFILTER]: PPTP helper: fixup gre_keymap_lookup() return type GRE keys are 16-bit wide. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_gre.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 56794797d55b..21ee124c0463 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) { struct ip_ct_gre_keymap *km; - u_int32_t key = 0; + __be16 key = 0; read_lock_bh(&ip_ct_gre_lock); km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, @@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, struct ip_conntrack_tuple *tuple) { struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; + __be16 srckey; struct gre_hdr _grehdr, *grehdr; /* first only delinearize old RFC1701 GRE header */ -- cgit v1.2.3 From 7c106d7e782bd4805f39da30e81018f861b4b8c5 Mon Sep 17 00:00:00 2001 From: Wong Hoi Sing Edison Date: Mon, 5 Jun 2006 17:27:58 -0700 Subject: [TCP]: TCP Low Priority congestion control TCP Low Priority is a distributed algorithm whose goal is to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. Available from: http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf Original Author: Aleksandar Kuzmanovic See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. As of 2.6.13, Linux supports pluggable congestion control algorithms. Due to the limitation of the API, we take the following changes from the original TCP-LP implementation: o We use newReno in most core CA handling. Only add some checking within cong_avoid. o Error correcting in remote HZ, therefore remote HZ will be keeped on checking and updating. o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne OWD have a similar meaning as RTT. Also correct the buggy formular. o Handle reaction for Early Congestion Indication (ECI) within pkts_acked, as mentioned within pseudo code. o OWD is handled in relative format, where local time stamp will in tcp_time_stamp format. Port from 2.4.19 to 2.6.16 as module by: Wong Hoi Sing Edison Hung Hing Lun Signed-off-by: Wong Hoi Sing Edison Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_lp.c | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 349 insertions(+) create mode 100644 net/ipv4/tcp_lp.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 35eb70b1448d..ae85149a0434 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -550,6 +550,16 @@ config TCP_CONG_SCALABLE properties, though is known to have fairness issues. See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ +config TCP_CONG_LP + tristate "TCP Low Priority" + depends on EXPERIMENTAL + default n + ---help--- + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utiliza only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4cc94482eacc..58a7a4634f70 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o +obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 000000000000..1f977b6ee9a1 --- /dev/null +++ b/net/ipv4/tcp_lp.c @@ -0,0 +1,338 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * + * Original Author: + * Aleksandar Kuzmanovic + * + * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * OWD have a similar meaning as RTT. Also correct the buggy formular. + * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Port from 2.4.19 to 2.6.16 as module by: + * Wong Hoi Sing Edison + * Hung Hing Lun + * + * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ + */ + +#include +#include +#include + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. + */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. + */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! */ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time + || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz != 0) { + m -= rhz >> 6; /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + /* record time for successful remote HZ calc */ + lp->flag |= LP_VALID_RHZ; + + out: + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. + */ +static u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. + */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= lp->sowd >> 3; /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. + * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + + pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when whithin inference */ + lp->owd_min = lp->sowd >> 3; + lp->owd_max = lp->sowd >> 2; + lp->owd_max_rsv = lp->sowd >> 2; + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp = { + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_lp_rtt_sample, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init tcp_lp_register(void) +{ + BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit tcp_lp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(tcp_lp_register); +module_exit(tcp_lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); -- cgit v1.2.3 From 76f1017757aa0c308a0b83ca611c9a89ee9a79a4 Mon Sep 17 00:00:00 2001 From: Bin Zhou Date: Mon, 5 Jun 2006 17:28:30 -0700 Subject: [TCP]: TCP Veno congestion control TCP Veno module is a new congestion control module to improve TCP performance over wireless networks. The key innovation in TCP Veno is the enhancement of TCP Reno/Sack congestion control algorithm by using the estimated state of a connection based on TCP Vegas. This scheme significantly reduces "blind" reduction of TCP window regardless of the cause of packet loss. This work is based on the research paper "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." C. P. Fu, S. C. Liew, IEEE Journal on Selected Areas in Communication, Feb. 2003. Original paper and many latest research works on veno: http://www.ntu.edu.sg/home/ascpfu/veno/veno.html Signed-off-by: Bin Zhou Cheng Peng Fu Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 12 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_veno.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 net/ipv4/tcp_veno.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index ae85149a0434..8514106761b0 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -560,6 +560,18 @@ config TCP_CONG_LP ``fair share`` of bandwidth as targeted by TCP. See http://www-ece.rice.edu/networks/TCP-LP/ +config TCP_CONG_VENO + tristate "TCP Veno" + depends on EXPERIMENTAL + default n + ---help--- + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 58a7a4634f70..00fcd2c1ba78 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 000000000000..1091671751c4 --- /dev/null +++ b/net/ipv4/tcp_veno.c @@ -0,0 +1,238 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." + * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static const int beta = 3 << V_PARAM_SHIFT; + +/* Veno variables */ +struct veno { + u8 doing_veno_now; /* if true, do veno for this rtt */ + u16 cntrtt; /* # of rtts measured within last rtt */ + u32 minrtt; /* min of rtts measured within last rtt (in usec) */ + u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ + u32 inc; /* decide whether to increase cwnd */ + u32 diff; /* calculate the diff rate */ +}; + +/* There are several situations when we must "re-start" Veno: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + */ +static inline void veno_enable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn on Veno */ + veno->doing_veno_now = 1; + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; + veno->inc = 1; + veno_enable(sk); +} + +/* Do rtt sampling needed for Veno. */ +static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min rtt during the last rtt to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) +{ + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh rtt samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. + */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough rtt samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough rtt samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd; + + /* We have enough rtt samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = ((tp->snd_cwnd * veno->basertt) + << V_PARAM_SHIFT) / rtt; + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } else { + /* In the "congestive state", increase cwnd + * every other rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc + && tp->snd_cwnd < + tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the slate clean for the next rtt. */ + /* veno->cntrtt = 0; */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase */ +static u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (veno->diff < beta) + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd * 4 / 5, 2U); + else + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); +} + +static u32 tcp_veno_min_cwnd(struct sock * sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + +static struct tcp_congestion_ops tcp_veno = { + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .min_cwnd = tcp_veno_min_cwnd, + .rtt_sample = tcp_veno_rtt_calc, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_veno); + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); -- cgit v1.2.3 From f890f921040fef6a35e39d15b729af1fd1a35f29 Mon Sep 17 00:00:00 2001 From: "Angelo P. Castellani" Date: Mon, 5 Jun 2006 17:29:09 -0700 Subject: [TCP]: TCP Compound congestion control TCP Compound is a sender-side only change to TCP that uses a mixed Reno/Vegas approach to calculate the cwnd. For further details look here: ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf Signed-off-by: Angelo P. Castellani Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_compound.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 net/ipv4/tcp_compound.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8514106761b0..da33393be45f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -572,6 +572,16 @@ config TCP_CONG_VENO loss packets. See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf +config TCP_CONG_COMPOUND + tristate "TCP Compound" + depends on EXPERIMENTAL + default n + ---help--- + TCP Compound is a sender-side only change to TCP that uses + a mixed Reno/Vegas approach to calculate the cwnd. + For further details look here: + ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 00fcd2c1ba78..f30faefc2672 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 000000000000..8c1ebfb7659e --- /dev/null +++ b/net/ipv4/tcp_compound.c @@ -0,0 +1,407 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + * + * + * TCP Compound based on TCP Vegas + * + * further details can be found here: + * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 + +#define TCP_COMPOUND_ALPHA 3U +#define TCP_COMPOUND_BETA 1U +#define TCP_COMPOUND_KAPPA_POW 3 +#define TCP_COMPOUND_KAPPA_NSQRT 2 +#define TCP_COMPOUND_GAMMA 30 +#define TCP_COMPOUND_ZETA 1 + +/* TCP compound variables */ +struct compound { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now; /* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + u32 cwnd; + u32 dwnd; +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_compound_init(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); + + vegas->dwnd = 0; + vegas->cwnd = tp->snd_cwnd; +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct compound *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_compound_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_compound_init(sk); +} + +static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + u8 inc = 0; + + if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { + if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { + vegas->cwnd = tp->snd_cwnd; + vegas->dwnd = 0; + } else + vegas->cwnd = tp->snd_cwnd - vegas->dwnd; + + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (vegas->cwnd <= tp->snd_ssthresh) + inc = 1; + else if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + inc = 1; + tp->snd_cwnd_cnt = 0; + } + + if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) + vegas->cwnd++; + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + if (!tp->mss_cache) + return; + + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT > 2) { + u32 rtt, target_cwnd, diff; + u32 brtt, dwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + if (!rtt) + return; + + brtt = vegas->baseRTT; + target_cwnd = ((old_wnd * brtt) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + dwnd = vegas->dwnd; + + if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { + u32 i, j, x, x2; + u64 v; + + v = 1; + + for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) + v *= old_wnd; + + for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { + x = 1; + for (j = 0; j < 200; j++) { + x2 = (x + v / x) / 2; + + if (x2 == x || !x2) + break; + + x = x2; + } + v = x; + } + + x = (u32) v >> TCP_COMPOUND_ALPHA; + + if (x > 1) + dwnd = x - 1; + else + dwnd = 0; + + dwnd += vegas->dwnd; + + } else if ((dwnd << V_PARAM_SHIFT) < + (diff * TCP_COMPOUND_BETA)) + dwnd = 0; + else + dwnd = + ((dwnd << V_PARAM_SHIFT) - + (diff * + TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; + + vegas->dwnd = dwnd; + + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + tp->snd_cwnd = vegas->cwnd + vegas->dwnd; +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct compound *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure:; + } +} + +static struct tcp_congestion_ops tcp_compound = { + .init = tcp_compound_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_compound_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_compound_rtt_calc, + .set_state = tcp_compound_state, + .cwnd_event = tcp_compound_cwnd_event, + .get_info = tcp_compound_get_info, + + .owner = THIS_MODULE, + .name = "compound", +}; + +static int __init tcp_compound_register(void) +{ + BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_compound); + return 0; +} + +static void __exit tcp_compound_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_compound); +} + +module_init(tcp_compound_register); +module_exit(tcp_compound_unregister); + +MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Compound"); -- cgit v1.2.3 From a4ed25849532728effaa0665c92e08e029e41407 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:29:39 -0700 Subject: [TCP]: TCP Compound quad root function The original code did a 64 bit divide directly, which won't work on 32 bit platforms. Rather than doing a 64 bit square root twice, just implement a 4th root function in one pass using Newton's method. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_compound.c | 90 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index 8c1ebfb7659e..ec68cb8081c1 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -52,8 +52,6 @@ #define TCP_COMPOUND_ALPHA 3U #define TCP_COMPOUND_BETA 1U -#define TCP_COMPOUND_KAPPA_POW 3 -#define TCP_COMPOUND_KAPPA_NSQRT 2 #define TCP_COMPOUND_GAMMA 30 #define TCP_COMPOUND_ZETA 1 @@ -156,6 +154,58 @@ static void tcp_compound_state(struct sock *sk, u8 ca_state) vegas_disable(sk); } + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u64 div64_64(u64 dividend, u64 divisor) +{ + u32 d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (u32) dividend / d; + + return dividend; +} + +/* calculate the quartic root of "a" using Newton-Raphson */ +static u32 qroot(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * qrt(x) = exp(log(x) / 4) + */ + x = 1u << (fls64(a) >> 2); + + /* + * Iteration based on: + * 3 + * x = ( 3 * x + a / x ) / 4 + * k+1 k k + */ + do { + u64 x3 = x; + + x1 = x; + x3 *= x; + x3 *= x; + + x = (3 * x + (u32) div64_64(a, x3)) / 4; + } while (abs(x1 - x) > 1); + + return x; +} + + /* * If the connection is idle and we are restarting, * then we don't want to do any Vegas calculations @@ -304,29 +354,21 @@ static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, dwnd = vegas->dwnd; if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { - u32 i, j, x, x2; u64 v; - - v = 1; - - for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) - v *= old_wnd; - - for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { - x = 1; - for (j = 0; j < 200; j++) { - x2 = (x + v / x) / 2; - - if (x2 == x || !x2) - break; - - x = x2; - } - v = x; - } - - x = (u32) v >> TCP_COMPOUND_ALPHA; - + u32 x; + + /* + * The TCP Compound paper describes the choice + * of "k" determines the agressiveness, + * ie. slope of the response function. + * + * For same value as HSTCP would be 0.8 + * but for computaional reasons, both the + * original authors and this implementation + * use 0.75. + */ + v = old_wnd; + x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; if (x > 1) dwnd = x - 1; else -- cgit v1.2.3 From 72dc5b9225c53310c010b68a70ea97c8c8e24bdf Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:08 -0700 Subject: [TCP]: Minimum congestion window consolidation. Many of the TCP congestion methods all just use ssthresh as the minimum congestion window on decrease. Rather than duplicating the code, just have that be the default if that handle in the ops structure is not set. Minor behaviour change to TCP compound. It probably wants to use this (ssthresh) as lower bound, rather than ssthresh/2 because the latter causes undershoot on loss. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_bic.c | 7 ------- net/ipv4/tcp_compound.c | 1 - net/ipv4/tcp_cong.c | 6 +++--- net/ipv4/tcp_cubic.c | 6 ------ net/ipv4/tcp_htcp.c | 9 --------- net/ipv4/tcp_input.c | 13 +++++++++++-- net/ipv4/tcp_veno.c | 7 ------- net/ipv4/tcp_westwood.c | 18 +++++++----------- 9 files changed, 23 insertions(+), 48 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index f1f472746e6c..de88c5472bfc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -632,7 +632,7 @@ struct tcp_congestion_ops { /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* lower bound for congestion window (optional) */ - u32 (*min_cwnd)(struct sock *sk); + u32 (*min_cwnd)(const struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good_ack); @@ -667,7 +667,7 @@ extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct sock *sk); extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag); -extern u32 tcp_reno_min_cwnd(struct sock *sk); +extern u32 tcp_reno_min_cwnd(const struct sock *sk); extern struct tcp_congestion_ops tcp_reno; static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f2092d73a..b2d9021ad22b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index ec68cb8081c1..bc54f7e9aea9 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -419,7 +419,6 @@ static struct tcp_congestion_ops tcp_compound = { .init = tcp_compound_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_compound_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .rtt_sample = tcp_compound_rtt_calc, .set_state = tcp_compound_state, .cwnd_event = tcp_compound_cwnd_event, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41c7f58..857eefc52aab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + if (!ca->ssthresh || !ca->cong_avoid) { printk(KERN_ERR "TCP %s does not implement required ops\n", ca->name); return -EINVAL; @@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct sock *sk) +/* Lower bound on congestion window with halving. */ +u32 tcp_reno_min_cwnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986dfbf7..78b7a6b9e4de 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - return tcp_sk(sk)->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53f98ed..3d92c1859267 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, } } -/* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - - static void htcp_init(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); @@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) static struct tcp_congestion_ops htcp = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, - .min_cwnd = htcp_min_cwnd, .cong_avoid = htcp_cong_avoid, .set_state = htcp_state, .undo_cwnd = htcp_cwnd_undo, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6d167889a4b0..e08245bdda3a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1689,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +/* Lower bound on congestion window is slow start threshold + * unless congestion avoidance choice decides to overide it. + */ +static inline u32 tcp_cwnd_min(const struct sock *sk) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + + return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + /* Decrease cwnd each second ack. */ static void tcp_cwnd_down(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 1091671751c4..11b42a7135c1 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -199,17 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static u32 tcp_veno_min_cwnd(struct sock * sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static struct tcp_congestion_ops tcp_veno = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .min_cwnd = tcp_veno_min_cwnd, .rtt_sample = tcp_veno_rtt_calc, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3756c2..29eb258b6d82 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -162,12 +162,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct westwood *w = inet_csk_ca(sk); - return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); -} /* * TCP Westwood @@ -175,9 +169,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct sock *sk) +static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { - return westwood_bw_rttmin(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) @@ -191,11 +187,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: @@ -235,7 +231,7 @@ static struct tcp_congestion_ops tcp_westwood = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_cwnd_min, + .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.2.3 From a42e9d6ce89cfd19aee9f990b7231ce697f0d00f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:32 -0700 Subject: [TCP]: TCP Probe congestion window tracing This adds a new module for tracking TCP state variables non-intrusively using kprobes. It has a simple /proc interface that outputs one line for each packet received. A sample usage is to collect congestion window and ssthresh over time graphs. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/Kconfig | 15 +++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_probe.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 net/ipv4/tcp_probe.c (limited to 'net/ipv4') diff --git a/net/Kconfig b/net/Kconfig index 4193cdcd3ae7..ff0db804eed8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -215,6 +215,21 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +config NET_TCPPROBE + tristate "TCP connection probing" + depends on INET && EXPERIMENTAL && PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to TCP connection + state in response to incoming patckets. It is used for debugging + TCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use the packet generator can be found + at http://linux-net.osdl.org/index.php/TcpProbe + + To compile this code as a module, choose M here: the + module will be called tcp_probe. + endmenu endmenu diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f30faefc2672..38b8039bdd55 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 000000000000..be94d526c7cb --- /dev/null +++ b/net/ipv4/tcp_probe.c @@ -0,0 +1,179 @@ +/* + * tcpprobe - Observe the TCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Stephen Hemminger "); +MODULE_DESCRIPTION("TCP cwnd snooper"); +MODULE_LICENSE("GPL"); + +static int port = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize = 64*1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static const char procname[] = "tcpprobe"; + +struct { + struct kfifo *fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timeval tstart; +} tcpw; + +static void printl(const char *fmt, ...) +{ + va_list args; + int len; + struct timeval now; + char tbuf[256]; + + va_start(args, fmt); + do_gettimeofday(&now); + + now.tv_sec -= tcpw.tstart.tv_sec; + now.tv_usec -= tcpw.tstart.tv_usec; + if (now.tv_usec < 0) { + --now.tv_sec; + now.tv_usec += 1000000; + } + + len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + + kfifo_put(tcpw.fifo, tbuf, len); + wake_up(&tcpw.wait); +} + +static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), + size, tp->snd_nxt, tp->snd_una, + tp->snd_cwnd, tcp_current_ssthresh(sk), + tp->snd_wnd); + } + + jprobe_return(); + return 0; +} + +static struct jprobe tcp_send_probe = { + .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, + .entry = (kprobe_opcode_t *) &jtcp_sendmsg, +}; + + +static int tcpprobe_open(struct inode * inode, struct file * file) +{ + kfifo_reset(tcpw.fifo); + do_gettimeofday(&tcpw.tstart); + return 0; +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt; + unsigned char *tbuf; + + if (!buf || len < 0) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(tcpw.wait, + __kfifo_len(tcpw.fifo) != 0); + if (error) + return error; + + cnt = kfifo_get(tcpw.fifo, tbuf, len); + error = copy_to_user(buf, tbuf, cnt); + + vfree(tbuf); + + return error ? error : cnt; +} + +static struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcpw.wait); + spin_lock_init(&tcpw.lock); + tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); + + if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + goto err0; + + ret = register_jprobe(&tcp_send_probe); + if (ret) + goto err1; + + pr_info("TCP watch registered (port=%d)\n", port); + return 0; + err1: + proc_net_remove(procname); + err0: + kfifo_free(tcpw.fifo); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + kfifo_free(tcpw.fifo); + proc_net_remove(procname); + unregister_jprobe(&tcp_send_probe); + +} +module_exit(tcpprobe_exit); -- cgit v1.2.3 From 738980ffa658c86bd494ebb242ce8e44aff16a9e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:56 -0700 Subject: [TCP]: Limited slow start for Highspeed TCP Implementation of RFC3742 limited slow start. Added as part of the TCP highspeed congestion control module. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index ba7c63ca5bb1..1120245b2373 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,6 +98,10 @@ struct hstcp { u32 ai; }; +static int max_ssthresh = 100; +module_param(max_ssthresh, int, 0644); +MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); + static void hstcp_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); - else { + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* RFC3742: limited slow start + * the window is increased by 1/K MSS for each arriving ACK, + * for K = int(cwnd/(0.5 max_ssthresh)) + */ + if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { + u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); + if (++tp->snd_cwnd_cnt >= k) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } else { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && -- cgit v1.2.3 From 70df2311ee3fc607e7511873d7dade5bd17d593d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 17:59:20 -0700 Subject: [TCP]: Fix compile warning in tcp_probe.c The suseconds_t et al. are not necessarily any particular type on every platform, so cast to unsigned long so that we can use one printf format string and avoid warnings across the board Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index be94d526c7cb..d7d517a3a238 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -68,7 +68,9 @@ static void printl(const char *fmt, ...) now.tv_usec += 1000000; } - len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_usec); len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); -- cgit v1.2.3 From 338fcf9886df9ad2873772197a73a57818973316 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:04:39 -0700 Subject: [IPV4] igmp: Fixup struct ip_mc_list::multiaddr type All users except two expect 32-bit big-endian value. One is of ->multiaddr = ->multiaddr variety. And last one is "%08lX". Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 +- net/ipv4/igmp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 28f4f3b36950..899c3d4776f3 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -169,7 +169,7 @@ struct ip_sf_list struct ip_mc_list { struct in_device *interface; - unsigned long multiaddr; + __be32 multiaddr; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned int sfmode; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239a1473..ab680c851aa2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) } seq_printf(seq, - "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_to_clock_t(im->timer.expires-jiffies) : 0, -- cgit v1.2.3 From 6d7416535097ed0943bdae8e69c14ba43061cab1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:06:41 -0700 Subject: [IPV4]: Right prototype of __raw_v4_lookup() All users pass 32-bit values as addresses and internally they're compared with 32-bit entities. So, change "laddr" and "raddr" types to __be32. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/raw.h | 2 +- net/ipv4/raw.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/raw.h b/include/net/raw.h index e67b28a0248c..d83571fe4c69 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -36,7 +36,7 @@ extern rwlock_t raw_v4_lock; extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif); extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc2562415555..bd221ec3f81e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) } struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; -- cgit v1.2.3 From f86502bfc177f69bbabbedb78ebf710579ae0e54 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 21:19:24 -0700 Subject: [IPV4] icmp: Kill local 'ip' arg in icmp_redirect(). It is typed wrong, and it's only assigned and used once. So just pass in iph->daddr directly which fixes both problems. Based upon a patch by Alexey Dobriyan. Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a0455911ee0..017900172f7d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -730,7 +730,6 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { struct iphdr *iph; - unsigned long ip; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) goto out; iph = (struct iphdr *)skb->data; - ip = iph->daddr; switch (skb->h.icmph->code & 7) { case ICMP_REDIR_NET: @@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, + skb->h.icmph->un.gateway, iph->saddr, skb->dev); break; } -- cgit v1.2.3 From 984bc16cc92ea3c247bf34ad667cfb95331b9d3c Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:29:17 -0700 Subject: [SECMARK]: Add secmark support to core networking. Add a secmark field to the skbuff structure, to allow security subsystems to place security markings on network packets. This is similar to the nfmark field, except is intended for implementing security policy, rather than than networking policy. This patch was already acked in principle by Dave Miller. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/skbuff.h | 22 ++++++++++++++++++++++ net/Kconfig | 7 +++++++ net/core/skbuff.c | 3 ++- net/ipv4/ip_output.c | 1 + net/ipv4/netfilter/ipt_REJECT.c | 1 + net/ipv6/ip6_output.c | 1 + 6 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 23bad3bf3c9d..fe2c58e5306f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -210,6 +210,7 @@ enum { * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict + * @secmark: security marking */ struct sk_buff { @@ -289,6 +290,9 @@ struct sk_buff { #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; #endif +#ifdef CONFIG_NETWORK_SECMARK + __u32 secmark; +#endif /* These elements must be at the end, see alloc_skb() for details. */ @@ -1400,5 +1404,23 @@ static inline void nf_reset(struct sk_buff *skb) static inline void nf_reset(struct sk_buff *skb) {} #endif /* CONFIG_NETFILTER */ +#ifdef CONFIG_NETWORK_SECMARK +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) +{ + to->secmark = from->secmark; +} + +static inline void skb_init_secmark(struct sk_buff *skb) +{ + skb->secmark = 0; +} +#else +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) +{ } + +static inline void skb_init_secmark(struct sk_buff *skb) +{ } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/Kconfig b/net/Kconfig index ccadc8e48152..c6cec5aa5486 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -66,6 +66,13 @@ source "net/ipv6/Kconfig" endif # if INET +config NETWORK_SECMARK + bool "Security Marking" + help + This enables security marking of network packets, similar + to nfmark, but designated for security purposes. + If you are unsure how to answer this question, answer N. + menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" ---help--- diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f9c094..96cdcbe24ba2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); #endif - + skb_copy_secmark(n, skb); #endif C(truesize); atomic_set(&n->users, 1); @@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->tc_index = old->tc_index; #endif + skb_copy_secmark(new, old); atomic_set(&new->users, 1); skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a72daf..d4bb3fae4e49 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } /* diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2bb786..431a3ce6f7b7 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); nskb->nfmark = 0; + skb_init_secmark(nskb); tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 416f6e428a0a..d29620f4910e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -459,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) -- cgit v1.2.3 From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:31:46 -0700 Subject: [SECMARK]: Add secmark support to conntrack Add a secmark field to IP and NF conntracks, so that security markings on packets can be copied to their associated connections, and also copied back to packets as required. This is similar to the network mark field currently used with conntrack, although it is intended for enforcement of security policy rather than network policy. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 4 ++++ include/net/netfilter/nf_conntrack.h | 4 ++++ include/net/netfilter/nf_conntrack_compat.h | 26 ++++++++++++++++++++++++++ net/ipv4/netfilter/Kconfig | 12 ++++++++++++ net/ipv4/netfilter/ip_conntrack_core.c | 3 +++ net/ipv4/netfilter/ip_conntrack_standalone.c | 5 +++++ net/netfilter/Kconfig | 12 ++++++++++++ net/netfilter/nf_conntrack_core.c | 3 +++ net/netfilter/nf_conntrack_standalone.c | 5 +++++ 9 files changed, 74 insertions(+) (limited to 'net/ipv4') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 17d7ef938a09..e0e9951eb8c3 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -121,6 +121,10 @@ struct ip_conntrack u_int32_t mark; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + u_int32_t secmark; +#endif + /* Traversed often, so hopefully in different cacheline to top */ /* These are my tuples; original and reply */ struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dbe7a114d0c5..411117815807 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -114,6 +114,10 @@ struct nf_conn u_int32_t mark; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + u_int32_t secmark; +#endif + /* Storage reserved for other modules: */ union nf_conntrack_proto proto; diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h index 3cac19fb3648..f1b1482d7200 100644 --- a/include/net/netfilter/nf_conntrack_compat.h +++ b/include/net/netfilter/nf_conntrack_compat.h @@ -20,6 +20,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, } #endif /* CONFIG_IP_NF_CONNTRACK_MARK */ +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK +static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo); + + if (ct) + return &ct->secmark; + else + return NULL; +} +#endif /* CONFIG_IP_NF_CONNTRACK_SECMARK */ + #ifdef CONFIG_IP_NF_CT_ACCT static inline struct ip_conntrack_counter * nf_ct_get_counters(const struct sk_buff *skb) @@ -70,6 +83,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, } #endif /* CONFIG_NF_CONNTRACK_MARK */ +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct nf_conn *ct = nf_ct_get(skb, ctinfo); + + if (ct) + return &ct->secmark; + else + return NULL; +} +#endif /* CONFIG_NF_CONNTRACK_MARK */ + #ifdef CONFIG_NF_CT_ACCT static inline struct ip_conntrack_counter * nf_ct_get_counters(const struct sk_buff *skb) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ff4b118f14a9..e1d7f5fbc526 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on IP_NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config IP_NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && IP_NF_CONNTRACK diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 4fe9e69378df..7e4cf9a4d15f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -723,6 +723,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) /* this is ugly, but there is no other place where to put it */ conntrack->nat.masq_index = exp->master->nat.masq_index; +#endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 6cb9b989d14c..88445aac3f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 10eccdd4d6ea..023f81e5f96b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_CONNTRACK diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bc2bd4c3859e..cd299f4b7db1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -989,6 +989,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, conntrack->master = exp->master; #ifdef CONFIG_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); NF_CT_STAT_INC(expect_new); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e01d20d8e287..e34c574f0351 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; -- cgit v1.2.3 From a0e889bb1bdc083dbbdb02cce9698765847b841f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:17:41 -0700 Subject: [NETFILTER]: recent match: fix "sleeping function called from invalid context" create_proc_entry must not be called with locks held. Use a mutex instead to protect data only changed in user context. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9686c4d2057d..9b09e481957d 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -69,6 +69,7 @@ struct recent_table { static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_dir; @@ -249,7 +250,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return 0; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { t->refcnt++; @@ -258,7 +259,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, } t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_ATOMIC); + GFP_KERNEL); if (t == NULL) goto out; strcpy(t->name, info->name); @@ -274,10 +275,12 @@ ipt_recent_checkentry(const char *tablename, const void *ip, t->proc->proc_fops = &recent_fops; t->proc->data = t; #endif + spin_lock_bh(&recent_lock); list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); ret = 1; out: - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); return ret; } @@ -288,17 +291,19 @@ ipt_recent_destroy(const struct xt_match *match, void *matchinfo, const struct ipt_recent_info *info = matchinfo; struct recent_table *t; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); list_del(&t->list); + spin_unlock_bh(&recent_lock); recent_table_flush(t); #ifdef CONFIG_PROC_FS remove_proc_entry(t->name, proc_dir); #endif kfree(t); } - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 2b2283d0302d520f08ded41c2ca17886dfbb865a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:17 -0700 Subject: [NETFILTER]: recent match: missing refcnt initialization Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9b09e481957d..61a2139f9cfd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -262,6 +262,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, GFP_KERNEL); if (t == NULL) goto out; + t->refcnt = 1; strcpy(t->name, info->name); INIT_LIST_HEAD(&t->lru_list); for (i = 0; i < ip_list_hash_size; i++) -- cgit v1.2.3 From bf0857ea32addb6bc8b46383604b218b8ec09f19 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:47 -0700 Subject: [NETFILTER]: hashlimit match: fix random initialization hashlimit does: if (!ht->rnd) get_random_bytes(&ht->rnd, 4); ignoring that 0 is also a valid random number. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 85edfb79469a..92980ab8ce48 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -80,6 +80,7 @@ struct ipt_hashlimit_htable { /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ + int rnd_initialized; struct timer_list timer; /* timer for gc */ atomic_t count; /* number entries in table */ @@ -134,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) /* initialize hash with random val at the time we allocate * the first hashtable entry */ - if (!ht->rnd) + if (!ht->rnd_initialized) { get_random_bytes(&ht->rnd, 4); + ht->rnd_initialized = 1; + } if (ht->cfg.max && atomic_read(&ht->count) >= ht->cfg.max) { @@ -214,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) atomic_set(&hinfo->count, 0); atomic_set(&hinfo->use, 1); - hinfo->rnd = 0; + hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); if (!hinfo->pde) { -- cgit v1.2.3 From 364c6badde0dd62a0a38e5ed67f85d87d6665780 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:10:40 -0700 Subject: [NET]: Clean up skb_linearize The linearisation operation doesn't need to be super-optimised. So we can replace __skb_linearize with __pskb_pull_tail which does the same thing but is more general. Also, most users of skb_linearize end up testing whether the skb is linear or not so it helps to make skb_linearize do just that. Some callers of skb_linearize also use it to copy cloned data, so it's useful to have a new function skb_linearize_cow to copy the data if it's either non-linear or cloned. Last but not least, I've removed the gfp argument since nobody uses it anymore. If it's ever needed we can easily add it back. Misc bugs fixed by this patch: * via-velocity error handling (also, no SG => no frags) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/block/aoe/aoenet.c | 3 +-- drivers/net/mv643xx_eth.c | 2 +- drivers/net/via-velocity.c | 10 +++++--- include/linux/skbuff.h | 24 +++++++++++++++--- net/core/dev.c | 63 ++-------------------------------------------- net/decnet/dn_nsp_in.c | 3 +-- net/decnet/dn_route.c | 3 +-- net/ipv4/ipcomp.c | 11 +++----- net/ipv6/ipcomp6.c | 11 +++----- 9 files changed, 39 insertions(+), 91 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index fdff774b8ab9..c1434ed11880 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -116,8 +116,7 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; - if (skb_is_nonlinear(skb)) - if (skb_linearize(skb, GFP_ATOMIC) < 0) + if (skb_linearize(skb)) goto exit; if (!is_aoe_netif(ifp)) goto exit; diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 411f4d809c47..625ff61c9988 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1200,7 +1200,7 @@ static int mv643xx_eth_start_xmit(struct sk_buff *skb, struct net_device *dev) } if (has_tiny_unaligned_frags(skb)) { - if ((skb_linearize(skb, GFP_ATOMIC) != 0)) { + if (__skb_linearize(skb)) { stats->tx_dropped++; printk(KERN_DEBUG "%s: failed to linearize tiny " "unaligned fragment\n", dev->name); diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c index ed1f837c8fda..2eb6b5f9ba0d 100644 --- a/drivers/net/via-velocity.c +++ b/drivers/net/via-velocity.c @@ -1899,6 +1899,13 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) int pktlen = skb->len; +#ifdef VELOCITY_ZERO_COPY_SUPPORT + if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) { + kfree_skb(skb); + return 0; + } +#endif + spin_lock_irqsave(&vptr->lock, flags); index = vptr->td_curr[qnum]; @@ -1914,8 +1921,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) */ if (pktlen < ETH_ZLEN) { /* Cannot occur until ZC support */ - if(skb_linearize(skb, GFP_ATOMIC)) - return 0; pktlen = ETH_ZLEN; memcpy(tdinfo->buf, skb->data, skb->len); memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len); @@ -1933,7 +1938,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) int nfrags = skb_shinfo(skb)->nr_frags; tdinfo->skb = skb; if (nfrags > 6) { - skb_linearize(skb, GFP_ATOMIC); memcpy(tdinfo->buf, skb->data, skb->len); tdinfo->skb_dma[0] = tdinfo->buf_dma; td_ptr->tdesc0.pktsize = diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe2c58e5306f..830f58fa03a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1169,18 +1169,34 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i, return 0; } +static inline int __skb_linearize(struct sk_buff *skb) +{ + return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; +} + /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize - * @gfp: allocation mode * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ -extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp); -static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp) +static inline int skb_linearize(struct sk_buff *skb) +{ + return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; +} + +/** + * skb_linearize_cow - make sure skb is linear and writable + * @skb: buffer to process + * + * If there is no free memory -ENOMEM is returned, otherwise zero + * is returned and the old skb data released. + */ +static inline int skb_linearize_cow(struct sk_buff *skb) { - return __skb_linearize(skb, gfp); + return skb_is_nonlinear(skb) || skb_cloned(skb) ? + __skb_linearize(skb) : 0; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 1b09f1cae46e..91361bc2b682 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1222,64 +1222,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) -{ - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; - - if (skb_shared(skb)) - BUG(); - - if (expand <= 0) - expand = 0; - - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; - - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); - - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; - - /* Offset between the two in bytes */ - offset = data - skb->head; - - /* Free old data. */ - skb_release_data(skb); - - skb->head = data; - skb->end = data + size; - - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; - - skb->tail += skb->data_len; - skb->data_len = 0; - return 0; -} - #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1326,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* Fragmented skb is linearized if device does not support SG, @@ -1335,7 +1277,7 @@ int dev_queue_xmit(struct sk_buff *skb) */ if (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support @@ -3473,7 +3415,6 @@ subsys_initcall(net_dev_init); EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 547523b41c81..a2ba9db1c376 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -801,8 +801,7 @@ got_it: * We linearize everything except data segments here. */ if (cb->nsp_flags & ~0x60) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto free_out; } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e172cf98d7fc..5abf7057af00 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type padlen); if (flags & DN_RT_PKT_CNTL) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto dump_it; switch(flags & DN_RT_CNTL_MSK) { diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 8e243589045f..3ed8b57a1002 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -80,15 +80,12 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct iphdr *iph; struct ip_comp_hdr *ipch; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -158,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } err = ipcomp_compress(x, skb); iph = skb->nh.iph; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index cec3be544b69..f28cd37feed3 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct ipv6hdr *iph; struct ipv6_comp_hdr *ipch; int plen, dlen; @@ -74,11 +74,8 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) struct crypto_tfm *tfm; int cpu; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -142,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } /* compression */ plen = skb->len - hdr_len; -- cgit v1.2.3 From bdeb04c6d9a957ae2a51c3033414467b82b2a736 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 21:20:38 -0700 Subject: [NET]: net.ipv4.ip_autoconfig sysctl removal The sysctl net.ipv4.ip_autoconfig is a legacy value that is not used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/sysctl_net_ipv4.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip.h b/include/net/ip.h index 3d2e5ca62a5a..ead233c9540d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -147,7 +147,6 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar struct ipv4_config { int log_martians; - int autoconfig; int no_pmtu_disc; }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6a6aa537b7aa..e7fb665873c6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -181,14 +181,6 @@ ctl_table ipv4_table[] = { .proc_handler = &ipv4_doint_and_flush, .strategy = &ipv4_doint_and_flush_strategy, }, - { - .ctl_name = NET_IPV4_AUTOCONFIG, - .procname = "ip_autoconfig", - .data = &ipv4_config.autoconfig, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, { .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", -- cgit v1.2.3 From f61e29018a30c738e1298e1b13be956aa17ee17b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 23:01:02 -0700 Subject: [TCP] Westwood: fix first sample Need to update send sequence number tracking after first ack. Rework of patch from Luca De Cicco. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 29eb258b6d82..62a96b71b0dd 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -22,6 +22,7 @@ struct westwood { u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ + u8 first_ack; /* flag which infers that this is the first ack */ }; @@ -52,6 +53,7 @@ static void tcp_westwood_init(struct sock *sk) w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; } /* @@ -91,6 +93,15 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; + /* Initialise w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first + * bandwidth sample + */ + if (w->first_ack) { + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 0; + } + /* * See if a RTT-window has passed. * Be careful since if RTT is less than @@ -180,7 +191,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.2.3 From b7d7a9e3c900f0733bf2aabdd41e6dbc70eae94b Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:39 -0700 Subject: [TCP] Westwood: comment fixes Cleanup some comments and add more references Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 62a96b71b0dd..12a2cd976e3a 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -1,7 +1,24 @@ /* - * TCP Westwood+ + * TCP Westwood+: end-to-end bandwidth estimation for TCP * - * Angelo Dell'Aera: TCP Westwood+ support + * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 + * + * Support at http://c3lab.poliba.it/index.php/Westwood + * Main references in literature: + * + * - Mascolo S, Casetti, M. Gerla et al. + * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 + * + * - A. Grieco, s. Mascolo + * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer + * Comm. Review, 2004 + * + * - A. Dell'Aera, L. Grieco, S. Mascolo. + * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : + * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 + * + * Westwood+ employs end-to-end bandwidth measurement to set cwnd and + * ssthresh after packet loss. The probing phase is as the original Reno. */ #include @@ -93,7 +110,7 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; - /* Initialise w->snd_una with the first acked sequence number in order + /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ @@ -191,7 +208,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.2.3 From b3a92eabe5b67bd207a38ae13dd51f4e08c1f6f7 Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:59 -0700 Subject: [TCP] Westwood: bandwidth filter startup The bandwidth estimate filter is now initialized with the first sample in order to have better performances in the case of small file transfers. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 12a2cd976e3a..edf2ee19a8ba 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -82,10 +82,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) return (((7 * a) + b) >> 3); } -static inline void westwood_filter(struct westwood *w, u32 delta) +static void westwood_filter(struct westwood *w, u32 delta) { - w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); - w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + /* If the filter is empty fill it with the first sample of bandwidth */ + if (w->bw_ns_est == 0 && w->bw_est == 0) { + w->bw_ns_est = w->bk / delta; + w->bw_est = w->bw_ns_est; + } else { + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + } } /* -- cgit v1.2.3 From bc726a71d2799f0f8b68a17f49d86aa030f64abc Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:02:19 -0700 Subject: [TCP] Westwood: reset RTT min after FRTO RTT_min is updated each time a timeout event occurs in order to cope with hard handovers in wireless scenarios such as UMTS. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index edf2ee19a8ba..4247da1384bf 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -40,6 +40,7 @@ struct westwood { u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ + u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; @@ -67,6 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; + w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; @@ -142,6 +144,16 @@ static void westwood_update_window(struct sock *sk) } } +static inline void update_rtt_min(struct westwood *w) +{ + if (w->reset_rtt_min) { + w->rtt_min = w->rtt; + w->reset_rtt_min = 0; + } else + w->rtt_min = min(w->rtt, w->rtt_min); +} + + /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when @@ -157,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); } /* @@ -226,12 +238,14 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) case CA_EVENT_FRTO: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + /* Update RTT_min when next ack arrives */ + w->reset_rtt_min = 1; break; case CA_EVENT_SLOW_ACK: westwood_update_window(sk); w->bk += westwood_acked_count(sk); - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); break; default: -- cgit v1.2.3 From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Jun 2006 22:33:04 -0700 Subject: [TCP]: Add tcp_slow_start_after_idle sysctl. A lot of people have asked for a way to disable tcp_cwnd_restart(), and it seems reasonable to add a sysctl to do that. Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/sysctl.h | 1 + include/net/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp_output.c | 6 +++++- 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f12007b80a46..d46338af6002 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 +tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. An idle period is defined at + the current RTO. If unset, the congestion window will not + be timed out after an idle period. + Default: 1 + IP Variables: ip_local_port_range - 2 INTEGERS diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 98338ed2c0b6..cee944dbdcd4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -405,6 +405,7 @@ enum NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, NET_TCP_DMA_COPYBREAK=116, + NET_TCP_SLOW_START_AFTER_IDLE=117, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index de88c5472bfc..bfc71f954bbe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -227,6 +227,7 @@ extern int sysctl_tcp_abc; extern int sysctl_tcp_mtu_probing; extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; +extern int sysctl_tcp_slow_start_after_idle; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e7fb665873c6..ce4cd5f35511 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -690,6 +690,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, #endif + { + .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, + .procname = "tcp_slow_start_after_idle", + .data = &sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f33c9dddaa12..07bb5a2b375e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; int sysctl_tcp_mtu_probing = 0; int sysctl_tcp_base_mss = 512; +/* By default, RFC2861 behavior. */ +int sysctl_tcp_slow_start_after_idle = 1; + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; -- cgit v1.2.3 From 8648b3053bff39a7ee4c711d74268079c928a657 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 22:06:05 -0700 Subject: [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM The current stack treats NETIF_F_HW_CSUM and NETIF_F_NO_CSUM identically so we test for them in quite a few places. For the sake of brevity, I'm adding the macro NETIF_F_GEN_CSUM for these two. We also test the disjunct of NETIF_F_IP_CSUM and the other two in various places, for that purpose I've added NETIF_F_ALL_CSUM. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++----- include/linux/netdevice.h | 3 +++ net/bridge/br_if.c | 3 +-- net/core/dev.c | 6 ++---- net/core/ethtool.c | 6 ++---- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 10 ++-------- 7 files changed, 13 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 46326cdfb277..8171cae06688 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1199,8 +1199,7 @@ int bond_sethwaddr(struct net_device *bond_dev, struct net_device *slave_dev) } #define BOND_INTERSECT_FEATURES \ - (NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\ - NETIF_F_TSO|NETIF_F_UFO) + (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO) /* * Compute the common dev->feature set available to all slaves. Some @@ -1218,9 +1217,7 @@ static int bond_compute_features(struct bonding *bond) features &= (slave->dev->features & BOND_INTERSECT_FEATURES); if ((features & NETIF_F_SG) && - !(features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) + !(features & NETIF_F_ALL_CSUM)) features &= ~NETIF_F_SG; /* diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 067b9ccafd87..e432b743dda2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -312,6 +312,9 @@ struct net_device #define NETIF_F_LLTX 4096 /* LockLess TX */ #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ +#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) +#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) + struct net_device *next_sched; /* Interface index. Unique device identifier */ diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f5d47bf4f967..90c95f59dc82 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -376,8 +376,7 @@ void br_features_recompute(struct net_bridge *br) checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (!(p->dev->features - & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) + if (!(p->dev->features & NETIF_F_ALL_CSUM)) checksum = 0; features &= p->dev->features; } diff --git a/net/core/dev.c b/net/core/dev.c index 91361bc2b682..ab39fe17cb58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,7 +1284,7 @@ int dev_queue_xmit(struct sk_buff *skb) * checksumming for this protocol, complete checksumming here. */ if (skb->ip_summed == CHECKSUM_HW && - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + (!(dev->features & NETIF_F_GEN_CSUM) && (!(dev->features & NETIF_F_IP_CSUM) || skb->protocol != htons(ETH_P_IP)))) if (skb_checksum_help(skb, 0)) @@ -2789,9 +2789,7 @@ int register_netdevice(struct net_device *dev) /* Fix illegal SG+CSUM combinations. */ if ((dev->features & NETIF_F_SG) && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) { + !(dev->features & NETIF_F_ALL_CSUM)) { printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", dev->name); dev->features &= ~NETIF_F_SG; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e6f76106a99b..3f269e4e9438 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev) u32 ethtool_op_get_tx_csum(struct net_device *dev) { - return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; + return (dev->features & NETIF_F_ALL_CSUM) != 0; } int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) @@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) + !(dev->features & NETIF_F_ALL_CSUM)) return -EINVAL; return __ethtool_set_sg(dev, edata.data); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d4bb3fae4e49..8538aac3d148 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -840,7 +840,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) csummode = CHECKSUM_HW; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ff6ccda9ff46..74998f250071 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ssize_t res; struct sock *sk = sock->sk; -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) return sock_no_sendpage(sock, page, offset, size, flags); -#undef TCP_ZC_CSUM_FLAGS - lock_sock(sk); TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); @@ -726,9 +722,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM)) + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); -- cgit v1.2.3 From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:40:14 -0700 Subject: [NET]: Merge TSO/UFO fields in sk_buff Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not going to scale if we add any more segmentation methods (e.g., DCCP). So let's merge them. They were used to tell the protocol of a packet. This function has been subsumed by the new gso_type field. This is essentially a set of netdev feature bits (shifted by 16 bits) that are required to process a specific skb. As such it's easy to tell whether a given device can process a GSO skb: you just have to and the gso_type field and the netdev's features field. I've made gso_type a conjunction. The idea is that you have a base type (e.g., SKB_GSO_TCPV4) that can be modified further to support new features. For example, if we add a hardware TSO type that supports ECN, they would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO packets with CWR set would have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN while all other TSO packets would be SKB_GSO_TCPV4. This means that only the CWR packets need to be emulated in software. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/8139cp.c | 2 +- drivers/net/bnx2.c | 4 ++-- drivers/net/chelsio/sge.c | 4 ++-- drivers/net/e1000/e1000_main.c | 10 ++++----- drivers/net/forcedeth.c | 4 ++-- drivers/net/ixgb/ixgb_main.c | 4 ++-- drivers/net/loopback.c | 4 ++-- drivers/net/myri10ge/myri10ge.c | 4 ++-- drivers/net/r8169.c | 2 +- drivers/net/s2io.c | 16 +++++++------- drivers/net/sky2.c | 4 ++-- drivers/net/tg3.c | 4 ++-- drivers/net/typhoon.c | 2 +- drivers/s390/net/qeth_eddp.c | 12 +++++------ drivers/s390/net/qeth_main.c | 4 ++-- drivers/s390/net/qeth_tso.h | 2 +- include/linux/netdevice.h | 14 ++++++++++-- include/linux/skbuff.h | 12 ++++++++--- include/net/tcp.h | 4 ++-- net/bridge/br_forward.c | 4 ++-- net/bridge/br_netfilter.c | 2 +- net/core/skbuff.c | 16 ++++++++------ net/ipv4/ip_output.c | 16 ++++++++------ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 47 ++++++++++++++++++++++++----------------- net/ipv6/ip6_output.c | 7 +++--- 27 files changed, 120 insertions(+), 90 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index a26077a175ad..0cdc830449d8 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -797,7 +797,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev) entry = cp->tx_head; eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0; if (dev->features & NETIF_F_TSO) - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->nr_frags == 0) { struct cp_desc *txd = &cp->tx_ring[entry]; diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 702d546567ad..7635736cc791 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp) skb = tx_buf->skb; #ifdef BCM_TSO /* partial BD completions possible with TSO packets */ - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { u16 last_idx, last_ring_idx; last_idx = sw_cons + @@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); } #ifdef BCM_TSO - if ((mss = skb_shinfo(skb)->tso_size) && + if ((mss = skb_shinfo(skb)->gso_size) && (skb->len > (bp->dev->mtu + ETH_HLEN))) { u32 tcp_opt_len, ip_tcp_len; diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 4391bf4bf573..53efff6da784 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) struct cpl_tx_pkt *cpl; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { int eth_type; struct cpl_tx_pkt_lso *hdr; @@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->ip_hdr_words = skb->nh.iph->ihl; hdr->tcp_hdr_words = skb->h.th->doff; hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type, - skb_shinfo(skb)->tso_size)); + skb_shinfo(skb)->gso_size)); hdr->len = htonl(skb->len - sizeof(*hdr)); cpl = (struct cpl_tx_pkt *)hdr; sge->stats.tx_lso_pkts++; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index a373ccb308d8..32b7d444b374 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, uint8_t ipcss, ipcso, tucss, tucso, hdr_len; int err; - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb->protocol == htons(ETH_P_IP)) { skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; @@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, * tso gets written back prematurely before the data is fully * DMA'd to the controller */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) { + !skb_shinfo(skb)->gso_size) { tx_ring->last_tx_tso = 0; size -= 4; } @@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; /* The controller does a simple calculation to * make sure there is enough room in the FIFO before * initiating the DMA for each buffer. The calc is: @@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) #ifdef NETIF_F_TSO /* Controller Erratum workaround */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) + !skb_shinfo(skb)->gso_size) count++; #endif diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 191383d461d7..21be4fa071b5 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev) np->tx_skbuff[nr] = skb; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) - tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT); + if (skb_shinfo(skb)->gso_size) + tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT); else #endif tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0); diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 57006fb8840e..8bb32f946993 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) uint16_t ipcse, tucse, mss; int err; - if(likely(skb_shinfo(skb)->tso_size)) { + if(likely(skb_shinfo(skb)->gso_size)) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr, diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b79d6e8d3045..43fef7de8cb9 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -74,7 +74,7 @@ static void emulate_large_send_offload(struct sk_buff *skb) struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4)); unsigned int doffset = (iph->ihl + th->doff) * 4; - unsigned int mtu = skb_shinfo(skb)->tso_size + doffset; + unsigned int mtu = skb_shinfo(skb)->gso_size + doffset; unsigned int offset = 0; u32 seq = ntohl(th->seq); u16 id = ntohs(iph->id); @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) #endif #ifdef LOOPBACK_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { BUG_ON(skb->protocol != htons(ETH_P_IP)); BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index b983e1e04348..dbdf189436fa 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1879,7 +1879,7 @@ again: #ifdef NETIF_F_TSO if (skb->len > (dev->mtu + ETH_HLEN)) { - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) max_segments = MYRI10GE_MAX_SEND_DESC_TSO; } @@ -2112,7 +2112,7 @@ abort_linearize: } idx = (idx + 1) & tx->mask; } while (idx != last_idx); - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { printk(KERN_ERR "myri10ge: %s: TSO but wanted to linearize?!?!?\n", mgp->dev->name); diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 985afe0e6273..12d1cb289bb0 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev) { if (dev->features & NETIF_F_TSO) { - u32 mss = skb_shinfo(skb)->tso_size; + u32 mss = skb_shinfo(skb)->gso_size; if (mss) return LargeSend | ((mss & MSSMask) << MSSShift); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 11daed495b97..3defe5d4f7d3 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -3959,8 +3959,8 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Control_1 = 0; txdp->Control_2 = 0; #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; - if (mss) { + mss = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) { txdp->Control_1 |= TXD_TCP_LSO_EN; txdp->Control_1 |= TXD_TCP_LSO_MSS(mss); } @@ -3980,10 +3980,10 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) } frg_len = skb->len - skb->data_len; - if (skb_shinfo(skb)->ufo_size) { + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) { int ufo_size; - ufo_size = skb_shinfo(skb)->ufo_size; + ufo_size = skb_shinfo(skb)->gso_size; ufo_size &= ~7; txdp->Control_1 |= TXD_UFO_EN; txdp->Control_1 |= TXD_UFO_MSS(ufo_size); @@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Host_Control = (unsigned long) skb; txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; frg_cnt = skb_shinfo(skb)->nr_frags; @@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) (sp->pdev, frag->page, frag->page_offset, frag->size, PCI_DMA_TODEVICE); txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; } txdp->Control_1 |= TXD_GATHER_CODE_LAST; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) frg_cnt++; /* as Txd0 was used for inband header */ tx_fifo = mac_control->tx_FIFO_start[queue]; @@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) if (mss) val64 |= TX_FIFO_SPECIAL_FUNC; #endif - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) val64 |= TX_FIFO_SPECIAL_FUNC; writeq(val64, &tx_fifo->List_Control); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index fba1e4d4d83d..d3577871be28 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct sk_buff *skb) count = sizeof(dma_addr_t) / sizeof(u32); count += skb_shinfo(skb)->nr_frags * count; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) ++count; if (skb->ip_summed == CHECKSUM_HW) @@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) } /* Check for TCP Segmentation Offload */ - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) { /* just drop the packet if non-linear expansion fails */ if (skb_header_cloned(skb) && diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b2ddd4522a87..e3e380f90f86 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && @@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d9258d42090c..e49e8b520c28 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -340,7 +340,7 @@ enum state_values { #endif #if defined(NETIF_F_TSO) -#define skb_tso_size(x) (skb_shinfo(x)->tso_size) +#define skb_tso_size(x) (skb_shinfo(x)->gso_size) #define TSO_NUM_DESCRIPTORS 2 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT #else diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c index 0bab60a20309..38aad8321456 100644 --- a/drivers/s390/net/qeth_eddp.c +++ b/drivers/s390/net/qeth_eddp.c @@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth_eddp_context *ctx, } tcph = eddp->skb->h.th; while (eddp->skb_offset < eddp->skb->len) { - data_len = min((int)skb_shinfo(eddp->skb)->tso_size, + data_len = min((int)skb_shinfo(eddp->skb)->gso_size, (int)(eddp->skb->len - eddp->skb_offset)); /* prepare qdio hdr */ if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){ @@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_eddp_context *ctx, struct sk_buff *skb, QETH_DBF_TEXT(trace, 5, "eddpcanp"); /* can we put multiple skbs in one page? */ - skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len); + skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len); if (skbs_per_page > 1){ - ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) / + ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) / skbs_per_page + 1; ctx->elements_per_skb = 1; } else { /* no -> how many elements per skb? */ - ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len + + ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len + PAGE_SIZE) >> PAGE_SHIFT; ctx->num_pages = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } ctx->num_elements = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } static inline struct qeth_eddp_context * diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index 9e671a48cd2f..56009d768326 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) struct qeth_eddp_context *ctx = NULL; int tx_bytes = skb->len; unsigned short nr_frags = skb_shinfo(skb)->nr_frags; - unsigned short tso_size = skb_shinfo(skb)->tso_size; + unsigned short tso_size = skb_shinfo(skb)->gso_size; int rc; QETH_DBF_TEXT(trace, 6, "sendpkt"); @@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) queue = card->qdio.out_qs [qeth_get_priority_queue(card, skb, ipv, cast_type)]; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) large_send = card->options.large_send; /*are we able to do TSO ? If so ,prepare and send it from here */ diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h index 24ef40ca9562..593f298142c1 100644 --- a/drivers/s390/net/qeth_tso.h +++ b/drivers/s390/net/qeth_tso.h @@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *card, struct sk_buff *skb) hdr->ext.hdr_version = 1; hdr->ext.hdr_len = 28; /*insert non-fix values */ - hdr->ext.mss = skb_shinfo(skb)->tso_size; + hdr->ext.mss = skb_shinfo(skb)->gso_size; hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4); hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len - sizeof(struct qeth_hdr_tso)); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cead6be467ed..fa5671307b90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,12 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ -#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ #define NETIF_F_LLTX 4096 /* LockLess TX */ -#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ + + /* Segmentation offload features */ +#define NETIF_F_GSO_SHIFT 16 +#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) +#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) @@ -979,6 +982,13 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern void linkwatch_run_queue(void); +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT; + return skb_shinfo(skb)->gso_size && + (dev->features & feature) != feature; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8c7eb79a27f..97b0d2d1a6b0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -134,9 +134,10 @@ struct skb_frag_struct { struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; - unsigned short tso_size; - unsigned short tso_segs; - unsigned short ufo_size; + unsigned short gso_size; + /* Warning: this field is not always filled in (UFO)! */ + unsigned short gso_segs; + unsigned short gso_type; unsigned int ip6_frag_id; struct sk_buff *frag_list; skb_frag_t frags[MAX_SKB_FRAGS]; @@ -168,6 +169,11 @@ enum { SKB_FCLONE_CLONE, }; +enum { + SKB_GSO_TCPV4 = 1 << 0, + SKB_GSO_UDPV4 = 1 << 1, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list diff --git a/include/net/tcp.h b/include/net/tcp.h index 5f4eb5c79689..b197a9e615c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -569,13 +569,13 @@ struct tcp_skb_cb { */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_segs; + return skb_shinfo(skb)->gso_segs; } /* This is valid iff tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_size; + return skb_shinfo(skb)->gso_size; } static inline void tcp_dec_pcount_approx(__u32 *count, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 0dca027ceb80..8be9f2123e54 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -34,8 +34,8 @@ static inline unsigned packet_length(const struct sk_buff *skb) int br_dev_queue_push_xmit(struct sk_buff *skb) { - /* drop mtu oversized packets except tso */ - if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size) + /* drop mtu oversized packets except gso */ + if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size) kfree_skb(skb); else { #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3e41f9d6d51c..8298a5179aef 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP) && skb->len > skb->dev->mtu && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + !skb_shinfo(skb)->gso_size) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe63d4efbd4d..368d98578c14 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); shinfo->nr_frags = 0; - shinfo->tso_size = 0; - shinfo->tso_segs = 0; - shinfo->ufo_size = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->frag_list = NULL; @@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; @@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif skb_copy_secmark(new, old); atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } /** diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8538aac3d148..7624fd1d8f9f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -362,7 +361,7 @@ packet_routed: } ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); if (!err) { /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; __skb_queue_tail(&sk->sk_write_queue, skb); return 0; @@ -1087,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_size) len = size; else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 74998f250071..062dd1a0d8a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -571,7 +571,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -818,7 +818,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e08245bdda3a..94fe5b1f9dcb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ else pkt_len = (end_seq - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; pcount = tcp_skb_pcount(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 07bb5a2b375e..bdd71db8bf90 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned /* Avoid the costly divide in the normal * non-TSO case. */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { unsigned int factor; factor = skb->len + (mss_now - 1); factor /= mss_now; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = mss_now; + skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; } } @@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int if (!tso_segs || (tso_segs > 1 && - skb_shinfo(skb)->tso_size != mss_now)) { + tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d29620f4910e..abb94de33768 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *skb) int ip6_output(struct sk_buff *skb) { - if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) || + if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else @@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, struct frag_hdr fhdr; /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - - sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen - + sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; ipv6_select_ident(skb, &fhdr); skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); -- cgit v1.2.3 From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:02:40 -0700 Subject: [NET]: Add software TSOv4 This patch adds the GSO implementation for IPv4 TCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/protocol.h | 1 + include/net/tcp.h | 2 + net/core/skbuff.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 51 ++++++++++++++++++++ net/ipv4/tcp.c | 62 ++++++++++++++++++++++++ 6 files changed, 243 insertions(+) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97b0d2d1a6b0..a45bba9b8cbd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1297,6 +1297,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern void skb_release_data(struct sk_buff *skb); +extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/include/net/protocol.h b/include/net/protocol.h index bcaee39bd2ff..3b6dc15c68a5 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -36,6 +36,7 @@ struct net_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); int no_policy; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index b197a9e615c1..ca3d38dfc00b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1086,6 +1086,8 @@ extern struct request_sock_ops tcp_request_sock_ops; extern int tcp_v4_destroy_sock(struct sock *sk); +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg); + #ifdef CONFIG_PROC_FS extern int tcp4_proc_init(void); extern void tcp4_proc_exit(void); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368d98578c14..8e5044ba3ab6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL_GPL(skb_pull_rcsum); +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather can be used for generated segments + * + * This function performs segmentation on the given skb. It returns + * the segment at the given position. It returns NULL if there are + * no more segments to generate, or when an error is encountered. + */ +struct sk_buff *skb_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb->mac.raw; + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize, nsize; + int k; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + nsize = hsize + doffset; + if (nsize > len + doffset || !sg) + nsize = len + doffset; + + nskb = alloc_skb(nsize + headroom, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + nskb->dev = skb->dev; + nskb->priority = skb->priority; + nskb->protocol = skb->protocol; + nskb->dst = dst_clone(skb->dst); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + nskb->pkt_type = skb->pkt_type; + nskb->mac_len = skb->mac_len; + + skb_reserve(nskb, headroom); + nskb->mac.raw = nskb->data; + nskb->nh.raw = nskb->data + skb->mac_len; + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw); + memcpy(skb_put(nskb, doffset), skb->data, doffset); + + if (!sg) { + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + k = 0; + + nskb->ip_summed = CHECKSUM_HW; + nskb->csum = skb->csum; + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + k++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + break; + } + + frag++; + } + + skb_shinfo(nskb)->nr_frags = k; + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree(skb); + } + return ERR_PTR(err); +} + +EXPORT_SYMBOL_GPL(skb_segment); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0a277453526b..461216b47948 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -68,6 +68,7 @@ */ #include +#include #include #include #include @@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL(inet_sk_rebuild_header); +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct iphdr *iph; + struct net_protocol *ops; + int proto; + int ihl; + int id; + + if (!pskb_may_pull(skb, sizeof(*iph))) + goto out; + + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (!pskb_may_pull(skb, ihl)) + goto out; + + skb->h.raw = __skb_pull(skb, ihl); + iph = skb->nh.iph; + id = ntohs(iph->id); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (ops && ops->gso_segment) + segs = ops->gso_segment(skb, sg); + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; + + skb = segs; + do { + iph = skb->nh.iph; + iph->id = htons(id++); + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + } while ((skb = skb->next)); + +out: + return segs; +} + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol = { static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, + .gso_segment = tcp_tso_segment, .no_policy = 1, }; @@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void); static struct packet_type ip_packet_type = { .type = __constant_htons(ETH_P_IP), .func = ip_rcv, + .gso_segment = inet_gso_segment, }; static int __init inet_init(void) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 062dd1a0d8a8..0e029c4e2903 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -258,6 +258,7 @@ #include #include #include +#include #include #include @@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned thlen; + unsigned int seq; + unsigned int delta; + unsigned int oldlen; + unsigned int len; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = skb->h.th; + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = ~htonl(skb->len); + __skb_pull(skb, thlen); + + segs = skb_segment(skb, sg); + if (IS_ERR(segs)) + goto out; + + len = skb_shinfo(skb)->gso_size; + delta = csum_add(oldlen, htonl(thlen + len)); + + skb = segs; + th = skb->h.th; + seq = ntohl(th->seq); + + do { + th->fin = th->psh = 0; + + if (skb->ip_summed == CHECKSUM_NONE) { + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + + seq += len; + skb = skb->next; + th = skb->h.th; + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + if (skb->ip_summed == CHECKSUM_NONE) { + delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw)); + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + +out: + return segs; +} + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; -- cgit v1.2.3 From 09b8f7a93efd4b2c4ef391e2fbf076f28c6d36d6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:08:03 -0700 Subject: [IPSEC]: Handle GSO packets This patch segments GSO packets received by the IPsec stack. This can happen when a NIC driver injects GSO packets into the stack which are then forwarded to another host. The primary application of this is going to be Xen where its backend driver may inject GSO packets into dom0. Of course this also can be used by other virtualisation schemes such as VMWare or UML since the tap device could be modified to inject GSO packets received through splice. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/xfrm4_output.c | 54 +++++++++++++++++++++++++++++++++++++++++-------- net/ipv6/xfrm6_output.c | 39 +++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index ac9d91d4bb05..193363e22932 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -9,6 +9,8 @@ */ #include +#include +#include #include #include #include @@ -97,16 +99,10 @@ error_nolock: goto out_exit; } -static int xfrm4_output_finish(struct sk_buff *skb) +static int xfrm4_output_finish2(struct sk_buff *skb) { int err; -#ifdef CONFIG_NETFILTER - if (!skb->dst->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } -#endif while (likely((err = xfrm4_output_one(skb)) == 0)) { nf_reset(skb); @@ -119,7 +115,7 @@ static int xfrm4_output_finish(struct sk_buff *skb) return dst_output(skb); err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL, - skb->dst->dev, xfrm4_output_finish); + skb->dst->dev, xfrm4_output_finish2); if (unlikely(err != 1)) break; } @@ -127,6 +123,48 @@ static int xfrm4_output_finish(struct sk_buff *skb) return err; } +static int xfrm4_output_finish(struct sk_buff *skb) +{ + struct sk_buff *segs; + +#ifdef CONFIG_NETFILTER + if (!skb->dst->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + + if (!skb_shinfo(skb)->gso_size) + return xfrm4_output_finish2(skb); + + skb->protocol = htons(ETH_P_IP); + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm4_output_finish2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + int xfrm4_output(struct sk_buff *skb) { return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 16e84254a252..48fccb1eca08 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -94,7 +94,7 @@ error_nolock: goto out_exit; } -static int xfrm6_output_finish(struct sk_buff *skb) +static int xfrm6_output_finish2(struct sk_buff *skb) { int err; @@ -110,7 +110,7 @@ static int xfrm6_output_finish(struct sk_buff *skb) return dst_output(skb); err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL, - skb->dst->dev, xfrm6_output_finish); + skb->dst->dev, xfrm6_output_finish2); if (unlikely(err != 1)) break; } @@ -118,6 +118,41 @@ static int xfrm6_output_finish(struct sk_buff *skb) return err; } +static int xfrm6_output_finish(struct sk_buff *skb) +{ + struct sk_buff *segs; + + if (!skb_shinfo(skb)->gso_size) + return xfrm6_output_finish2(skb); + + skb->protocol = htons(ETH_P_IP); + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm6_output_finish2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + int xfrm6_output(struct sk_buff *skb) { return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev, -- cgit v1.2.3 From fb1bb34d45400f12e0a33f8c487b3795674908a7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 25 Jun 2006 05:46:43 -0700 Subject: [PATCH] remove for_each_cpu() Convert a few stragglers over to for_each_possible_cpu(), remove for_each_cpu(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +++--- arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c | 6 +++--- arch/ia64/kernel/topology.c | 2 +- arch/powerpc/platforms/cell/interrupt.c | 2 +- include/linux/cpumask.h | 1 - net/ipv4/netfilter/ip_tables.c | 2 +- 6 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 05668e3598c0..5fd65325b81a 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -371,11 +371,11 @@ static int acpi_cpufreq_early_init_acpi(void) dprintk("acpi_cpufreq_early_init\n"); - for_each_cpu(i) { + for_each_possible_cpu(i) { data = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); if (!data) { - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } @@ -584,7 +584,7 @@ acpi_cpufreq_exit (void) cpufreq_unregister_driver(&acpi_cpufreq_driver); - for_each_cpu(i) { + for_each_possible_cpu(i) { kfree(acpi_perf_data[i]); acpi_perf_data[i] = NULL; } diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index 31c3a5baaa7f..f7e4356f6820 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -361,11 +361,11 @@ static int centrino_cpu_early_init_acpi(void) unsigned int i, j; struct acpi_processor_performance *data; - for_each_cpu(i) { + for_each_possible_cpu(i) { data = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); if (!data) { - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } @@ -805,7 +805,7 @@ static void __exit centrino_exit(void) cpufreq_unregister_driver(¢rino_driver); #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 4f3a16b37f8f..879edb51d1e0 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -166,7 +166,7 @@ static void cache_shared_cpu_map_setup( unsigned int cpu, num_shared = (int) csi.num_shared; do { - for_each_cpu(j) + for_each_possible_cpu(j) if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id && cpu_data(j)->core_id == csi.log1_cid && cpu_data(j)->thread_id == csi.log1_tid) diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index f4e2d8805c9e..1bbf822b4efc 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -180,7 +180,7 @@ static int setup_iic_hardcoded(void) unsigned long regs; struct iic *iic; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { iic = &per_cpu(iic, cpu); nodeid = cpu/2; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9cbb781d6f80..fb5b761e3444 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -405,7 +405,6 @@ int __any_online_cpu(const cpumask_t *mask); #define any_online_cpu(mask) 0 #endif -#define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) #define for_each_possible_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) #define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cee3397ec277..706c0025ec5e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1761,7 +1761,7 @@ translate_compat_table(const char *name, goto free_newinfo; /* And one copy for every other CPU */ - for_each_cpu(i) + for_each_possible_cpu(i) if (newinfo->entries[i] && newinfo->entries[i] != entry1) memcpy(newinfo->entries[i], entry1, newinfo->size); -- cgit v1.2.3 From bfe5d834195b3089b8846577311340376cc0f450 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 25 Jun 2006 05:47:14 -0700 Subject: [PATCH] Define __raw_get_cpu_var and use it There are several instances of per_cpu(foo, raw_smp_processor_id()), which is semantically equivalent to __get_cpu_var(foo) but without the warning that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For those architectures with optimized per-cpu implementations, namely ia64, powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower code than __get_cpu_var(), so it would be preferable to use __get_cpu_var on those platforms. This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x, raw_smp_processor_id()) on architectures that use the generic per-cpu implementation, and turns into __get_cpu_var(x) on the architectures that have an optimized per-cpu implementation. Signed-off-by: Paul Mackerras Acked-by: David S. Miller Acked-by: Ingo Molnar Acked-by: Martin Schwidefsky Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 ++ include/asm-ia64/percpu.h | 2 ++ include/asm-powerpc/percpu.h | 2 ++ include/asm-s390/percpu.h | 2 ++ include/asm-sparc64/percpu.h | 2 ++ include/asm-x86_64/percpu.h | 2 ++ kernel/hrtimer.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/softlockup.c | 2 +- kernel/timer.c | 2 +- net/ipv4/route.c | 2 +- 11 files changed, 19 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index c0caf433a7d7..c74521157461 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -14,6 +14,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) per_cpu(var, smp_processor_id()) +#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id()) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -30,6 +31,7 @@ do { \ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index ae357d504fba..24d898b650c5 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -42,6 +42,7 @@ DECLARE_PER_CPU(unsigned long, local_per_cpu_offset); #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); @@ -51,6 +52,7 @@ extern void *per_cpu_init(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h index 184a7a4d2fdf..faa1fc703053 100644 --- a/include/asm-powerpc/percpu.h +++ b/include/asm-powerpc/percpu.h @@ -22,6 +22,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -41,6 +42,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h index 436d216601e5..d9a8cca9b653 100644 --- a/include/asm-s390/percpu.h +++ b/include/asm-s390/percpu.h @@ -40,6 +40,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) +#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) /* A macro to avoid #include hell... */ @@ -57,6 +58,7 @@ do { \ __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,0) +#define __raw_get_cpu_var(var) __reloc_hide(var,0) #define per_cpu(var,cpu) __reloc_hide(var,0) #endif /* SMP */ diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index baef13b58952..a6ece06b83db 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -21,6 +21,7 @@ register unsigned long __local_per_cpu_offset asm("g5"); /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -37,6 +38,7 @@ do { \ #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 7f33aaf9f7b1..549eb929b2c0 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -21,6 +21,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -40,6 +41,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 18324305724a..9587aac72f4d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -576,7 +576,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; @@ -599,7 +599,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; diff --git a/kernel/sched.c b/kernel/sched.c index 5dbc42694477..f8d540b324ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4152,7 +4152,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4163,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..2c1be1163edc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); diff --git a/kernel/timer.c b/kernel/timer.c index f35b3939e937..eb97371b87d8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cc9423de7311..60b11aece5c3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -244,7 +244,7 @@ static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ - (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++) + (__raw_get_cpu_var(rt_cache_stat).field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); -- cgit v1.2.3