summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorSimon Horman <horms@verge.net.au>2008-10-06 23:40:11 +0200
committerSimon Horman <horms@verge.net.au>2008-10-06 23:40:11 +0200
commita5e8546a8bff5d2047adc279df5753c44ba7b1a1 (patch)
treed9ca91f74d8279adbb1d3e942cc7ab145780ee29 /net/ipv4
parentIPVS: Move IPVS to net/netfilter/ipvs (diff)
parenttcp: Respect SO_RCVLOWAT in tcp_poll(). (diff)
downloadlinux-a5e8546a8bff5d2047adc279df5753c44ba7b1a1.tar.xz
linux-a5e8546a8bff5d2047adc279df5753c44ba7b1a1.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6 into lvs-next-2.6
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/inet_connection_sock.c3
-rw-r--r--net/ipv4/inet_timewait_sock.c1
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_sockglue.c15
-rw-r--r--net/ipv4/netfilter.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c1
-rw-r--r--net/ipv4/route.c20
-rw-r--r--net/ipv4/syncookies.c3
-rw-r--r--net/ipv4/tcp.c12
-rw-r--r--net/ipv4/tcp_input.c6
-rw-r--r--net/ipv4/tcp_ipv4.c14
-rw-r--r--net/ipv4/tcp_output.c4
-rw-r--r--net/ipv4/udp.c69
14 files changed, 106 insertions, 51 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a3ac1fa71a9..1fbff5fa4241 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -469,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
- !inet->freebind &&
+ !(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0c1ae68ee84b..21fcc5a9045f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -335,6 +335,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
.saddr = ireq->loc_addr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = sk->sk_protocol,
+ .flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
{ .sport = inet_sk(sk)->sport,
.dport = ireq->rmt_port } } };
@@ -515,6 +516,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
newicsk->icsk_bind_hash = NULL;
inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+ inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
+ inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
newsk->sk_write_space = sk_stream_write_space;
newicsk->icsk_retransmits = 0;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 743f011b9a84..1c5fd38f8824 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -126,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_reuse = sk->sk_reuse;
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
+ tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
twsk_net_set(tw, hold_net(sock_net(sk)));
atomic_set(&tw->tw_refcnt, 1);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d533a89e08de..d2a8f8bb78a6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -340,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
.saddr = inet->saddr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = sk->sk_protocol,
+ .flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
{ .sport = inet->sport,
.dport = inet->dport } } };
@@ -1371,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
.uli_u = { .ports =
{ .sport = tcp_hdr(skb)->dest,
.dport = tcp_hdr(skb)->source } },
- .proto = sk->sk_protocol };
+ .proto = sk->sk_protocol,
+ .flags = ip_reply_arg_flowi_flags(arg) };
security_skb_classify_flow(skb, &fl);
if (ip_route_output_key(sock_net(sk), &rt, &fl))
return;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 105d92a039b9..465abf0a9869 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -419,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
(1<<IP_TTL) | (1<<IP_HDRINCL) |
(1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
(1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
- (1<<IP_PASSSEC))) ||
+ (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
optname == IP_MULTICAST_TTL ||
optname == IP_MULTICAST_LOOP) {
if (optlen >= sizeof(int)) {
@@ -878,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
+ case IP_TRANSPARENT:
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (optlen < 1)
+ goto e_inval;
+ inet->transparent = !!val;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -1130,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_FREEBIND:
val = inet->freebind;
break;
+ case IP_TRANSPARENT:
+ val = inet->transparent;
+ break;
default:
release_sock(sk);
return -ENOPROTOOPT;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index f8edacdf991d..01671ad51ed3 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -20,6 +20,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
unsigned int type;
type = inet_addr_type(&init_net, iph->saddr);
+ if (skb->sk && inet_sk(skb->sk)->transparent)
+ type = RTN_LOCAL;
if (addr_type == RTN_UNSPEC)
addr_type = type;
@@ -33,6 +35,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
fl.mark = skb->mark;
+ fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
if (ip_route_output_key(&init_net, &rt, &fl) != 0)
return -1;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 11976ea29884..112dcfa12900 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -16,6 +16,7 @@
#include <linux/udp.h>
#include <net/checksum.h>
#include <net/tcp.h>
+#include <net/route.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f62187bb6d08..a6d7c584f53b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2361,11 +2361,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
ipv4_is_zeronet(oldflp->fl4_src))
goto out;
- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
- if (dev_out == NULL)
- goto out;
-
/* I removed check for oif == dev_out->oif here.
It was wrong for two reasons:
1. ip_dev_find(net, saddr) can return wrong iface, if saddr
@@ -2377,6 +2372,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (oldflp->oif == 0
&& (ipv4_is_multicast(oldflp->fl4_dst) ||
oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = ip_dev_find(net, oldflp->fl4_src);
+ if (dev_out == NULL)
+ goto out;
+
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2395,9 +2395,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
fl.oif = dev_out->ifindex;
goto make_route;
}
- if (dev_out)
+
+ if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = ip_dev_find(net, oldflp->fl4_src);
+ if (dev_out == NULL)
+ goto out;
dev_put(dev_out);
- dev_out = NULL;
+ dev_out = NULL;
+ }
}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 9d38005abbac..d346c22aa6ae 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -16,6 +16,7 @@
#include <linux/cryptohash.h>
#include <linux/kernel.h>
#include <net/tcp.h>
+#include <net/route.h>
/* Timestamps: lowest 9 bits store TCP options */
#define TSBITS 9
@@ -296,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
req->mss = mss;
+ ireq->loc_port = th->dest;
ireq->rmt_port = th->source;
ireq->loc_addr = ip_hdr(skb)->daddr;
ireq->rmt_addr = ip_hdr(skb)->saddr;
@@ -337,6 +339,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
.saddr = ireq->loc_addr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = IPPROTO_TCP,
+ .flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
{ .sport = th->dest,
.dport = th->source } } };
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1ab341e5d3e0..7d81a1ee5507 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -384,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Connected? */
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int target = sock_rcvlowat(sk, 0, INT_MAX);
+
+ if (tp->urg_seq == tp->copied_seq &&
+ !sock_flag(sk, SOCK_URGINLINE) &&
+ tp->urg_data)
+ target--;
+
/* Potential race condition. If read of tp below will
* escape above sk->sk_state, we can be illegally awaken
* in SYN_* states. */
- if ((tp->rcv_nxt != tp->copied_seq) &&
- (tp->urg_seq != tp->copied_seq ||
- tp->rcv_nxt != tp->copied_seq + 1 ||
- sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+ if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 85627f83665f..3b76bce769dd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1746,6 +1746,8 @@ int tcp_use_frto(struct sock *sk)
return 0;
skb = tcp_write_queue_head(sk);
+ if (tcp_skb_is_last(sk, skb))
+ return 1;
skb = tcp_write_queue_next(sk, skb); /* Skips head */
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
@@ -4156,7 +4158,7 @@ drop:
skb1 = skb1->prev;
}
}
- __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
/* And clean segments covered by new one as whole. */
while ((skb1 = skb->next) !=
@@ -4254,7 +4256,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
memcpy(nskb->head, skb->head, header);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_insert(nskb, skb->prev, skb, list);
+ __skb_queue_before(list, skb, nskb);
skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 44aef1c1f373..8b24bd833cb4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -591,6 +591,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
ip_hdr(skb)->saddr, /* XXX */
sizeof(struct tcphdr), IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
net = dev_net(skb->dst->dev);
ip_send_reply(net->ipv4.tcp_sock, skb,
@@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 ts, int oif,
- struct tcp_md5sig_key *key)
+ struct tcp_md5sig_key *key,
+ int reply_flags)
{
struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -618,7 +620,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net(skb->dst->dev);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+ arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
@@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
- tcp_twsk_md5_key(tcptw)
+ tcp_twsk_md5_key(tcptw),
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
);
inet_twsk_put(tw);
@@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
req->ts_recent,
0,
- tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
+ tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
}
/*
@@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
+ ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);
if (!want_cookie)
TCP_ECN_create_request(req, tcp_hdr(skb));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c3d58ee3e16f..493553c71d32 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1932,8 +1932,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
/* Collapse two adjacent packets if worthwhile and we can. */
if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
(skb->len < (cur_mss >> 1)) &&
- (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
(!tcp_skb_is_last(sk, skb)) &&
+ (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
(skb_shinfo(skb)->nr_frags == 0 &&
skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
(tcp_skb_pcount(skb) == 1 &&
@@ -2275,7 +2275,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->syn = 1;
th->ack = 1;
TCP_ECN_make_synack(req, th);
- th->source = inet_sk(sk)->sport;
+ th->source = ireq->loc_port;
th->dest = ireq->rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8e42fbbd5761..c83d0ef469c9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -302,6 +302,13 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
return result;
}
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+
static inline struct sock *udp_v4_mcast_next(struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
@@ -951,6 +958,27 @@ int udp_disconnect(struct sock *sk, int flags)
return 0;
}
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int is_udplite = IS_UDPLITE(sk);
+ int rc;
+
+ if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM)
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ goto drop;
+ }
+
+ return 0;
+
+drop:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ kfree_skb(skb);
+ return -1;
+}
+
/* returns:
* -1: error
* 0: success
@@ -989,9 +1017,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
up->encap_rcv != NULL) {
int ret;
- bh_unlock_sock(sk);
ret = (*up->encap_rcv)(sk, skb);
- bh_lock_sock(sk);
if (ret <= 0) {
UDP_INC_STATS_BH(sock_net(sk),
UDP_MIB_INDATAGRAMS,
@@ -1044,17 +1070,16 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
goto drop;
}
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
- /* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM) {
- UDP_INC_STATS_BH(sock_net(sk),
- UDP_MIB_RCVBUFERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
- }
- goto drop;
- }
+ rc = 0;
- return 0;
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ rc = __udp_queue_rcv_skb(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+
+ return rc;
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
@@ -1092,15 +1117,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1) {
- int ret = 0;
-
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- ret = udp_queue_rcv_skb(sk, skb1);
- else
- sk_add_backlog(sk, skb1);
- bh_unlock_sock(sk);
-
+ int ret = udp_queue_rcv_skb(sk, skb1);
if (ret > 0)
/* we should probably re-process instead
* of dropping packets here. */
@@ -1195,13 +1212,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
uh->dest, inet_iif(skb), udptable);
if (sk != NULL) {
- int ret = 0;
- bh_lock_sock(sk);
- if (!sock_owned_by_user(sk))
- ret = udp_queue_rcv_skb(sk, skb);
- else
- sk_add_backlog(sk, skb);
- bh_unlock_sock(sk);
+ int ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
@@ -1494,7 +1505,7 @@ struct proto udp_prot = {
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
- .backlog_rcv = udp_queue_rcv_skb,
+ .backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,