17 files changed, 847 insertions, 367 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 3e63123f7bbd..df5386885a90 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -3,7 +3,6 @@
 #
 config IP_MULTICAST
 	bool "IP: multicasting"
-	depends on INET
 	help
 	  This is code for addressing several networked computers at once,
 	  enlarging your kernel by about 2 KB. You need multicasting if you
@@ -17,7 +16,6 @@ config IP_MULTICAST
 
 config IP_ADVANCED_ROUTER
 	bool "IP: advanced router"
-	depends on INET
 	---help---
 	  If you intend to run your Linux box mostly as a router, i.e. as a
 	  computer that forwards and redistributes network packets, say Y; you
@@ -183,7 +181,6 @@ config IP_ROUTE_VERBOSE
 
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
-	depends on INET
 	help
 	  This enables automatic configuration of IP addresses of devices and
 	  of the routing table during kernel boot, based on either information
@@ -242,7 +239,6 @@ config IP_PNP_RARP
 #   bool '    IP: ARP support' CONFIG_IP_PNP_ARP		
 config NET_IPIP
 	tristate "IP: tunneling"
-	depends on INET
 	select INET_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
@@ -260,7 +256,6 @@ config NET_IPIP
 
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
-	depends on INET
 	select XFRM
 	help
 	  Tunneling means encapsulating data of one protocol type within
@@ -319,7 +314,7 @@ config IP_PIMSM_V2
 
 config ARPD
 	bool "IP: ARP daemon support (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	---help---
 	  Normally, the kernel maintains an internal cache which maps IP
 	  addresses to hardware addresses on the local network, so that
@@ -344,7 +339,6 @@ config ARPD
 
 config SYN_COOKIES
 	bool "IP: TCP syncookie support (disabled per default)"
-	depends on INET
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -381,7 +375,6 @@ config SYN_COOKIES
 
 config INET_AH
 	tristate "IP: AH transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -394,7 +387,6 @@ config INET_AH
 
 config INET_ESP
 	tristate "IP: ESP transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -408,7 +400,6 @@ config INET_ESP
 
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
-	depends on INET
 	select XFRM
 	select INET_TUNNEL
 	select CRYPTO
@@ -421,7 +412,6 @@ config INET_IPCOMP
 
 config INET_TUNNEL
 	tristate "IP: tunnel transformation"
-	depends on INET
 	select XFRM
 	---help---
 	  Support for generic IP tunnel transformation, which is required by
@@ -431,7 +421,6 @@ config INET_TUNNEL
 
 config IP_TCPDIAG
 	tristate "IP: TCP socket monitoring interface"
-	depends on INET
 	default y
 	---help---
 	  Support for TCP socket monitoring interface used by native Linux
@@ -447,7 +436,6 @@ config IP_TCPDIAG_IPV6
 
 config TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
-	depends on INET
 	---help---
 	  Support for selection of various TCP congestion control
 	  modules.
@@ -463,7 +451,6 @@ menu "TCP congestion control"
 
 config TCP_CONG_BIC
 	tristate "Binary Increase Congestion (BIC) control"
-	depends on INET
 	default y
 	---help---
 	BIC-TCP is a sender-side only change that ensures a linear RTT
@@ -478,7 +465,6 @@ config TCP_CONG_BIC
 
 config TCP_CONG_WESTWOOD
 	tristate "TCP Westwood+"
-	depends on INET
 	default m
 	---help---
 	TCP Westwood+ is a sender-side only modification of the TCP Reno
@@ -493,7 +479,6 @@ config TCP_CONG_WESTWOOD
 
 config TCP_CONG_HTCP
         tristate "H-TCP"
-	depends on INET
         default m
 	---help---
 	H-TCP is a send-side only modifications of the TCP Reno
@@ -505,7 +490,7 @@ config TCP_CONG_HTCP
 
 config TCP_CONG_HSTCP
 	tristate "High Speed TCP"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	default n
 	---help---
 	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -516,7 +501,7 @@ config TCP_CONG_HSTCP
 
 config TCP_CONG_HYBLA
 	tristate "TCP-Hybla congestion control algorithm"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	default n
 	---help---
 	TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -526,7 +511,7 @@ config TCP_CONG_HYBLA
 
 config TCP_CONG_VEGAS
 	tristate "TCP Vegas"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	default n
 	---help---
 	TCP Vegas is a sender-side only change to TCP that anticipates
@@ -537,7 +522,7 @@ config TCP_CONG_VEGAS
 
 config TCP_CONG_SCALABLE
 	tristate "Scalable TCP"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	default n
 	---help---
 	Scalable TCP is a sender-side only change to TCP which uses a
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
 static int ipv4_proc_init(void);
 extern void ipfrag_init(void);
 
+/*
+ *	IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+	.type = __constant_htons(ETH_P_IP),
+	.func = ip_rcv,
+};
+
 static int __init inet_init(void)
 {
 	struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
 
 	ipfrag_init();
 
+	dev_add_pack(&ip_packet_type);
+
 	rc = 0;
 out:
 	return rc;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b56e88edf1b3..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *		2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.324"
+#define VERSION "0.325"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
 	unsigned int semantic_match_passed;
 	unsigned int semantic_match_miss;
 	unsigned int null_node_hit;
+	unsigned int resize_node_skipped;
 };
 #endif
 
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
 static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
 	kfree(li);
 }
 
+static struct tnode *tnode_alloc(unsigned int size)
+{
+	/* Up to PAGE_SIZE comes from kmalloc; anything larger takes whole pages. */
+	if (size <= PAGE_SIZE)
+		return kmalloc(size, GFP_KERNEL);
+
+	return (struct tnode *)
+	       __get_free_pages(GFP_KERNEL, get_order(size));
+}
+
+static void __tnode_free(struct tnode *tn)
+{
+	/* Recompute the node's byte size to choose the matching release call. */
+	unsigned int size = sizeof(struct tnode) +
+	                    (1<<tn->bits) * sizeof(struct node *);
+	if (size > PAGE_SIZE)
+		free_pages((unsigned long)tn, get_order(size));
+	else
+		kfree(tn);
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
-	struct tnode *tn = kmalloc(sz,  GFP_KERNEL);
+	struct tnode *tn = tnode_alloc(sz);
 
 	if(tn)  {
 		memset(tn, 0, sz);
@@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn)
 			printk("FL %p \n", tn);
 	}
 	else if(IS_TNODE(tn)) { 
-		kfree(tn);
+		__tnode_free(tn);
 		if(trie_debug > 0 ) 
 			printk("FT %p \n", tn);
 	}
@@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 static struct node *resize(struct trie *t, struct tnode *tn) 
 {
 	int i;
+	int err = 0;
 
  	if (!tn)
 		return NULL;
@@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 */
 
 	check_tnode(tn);
-
+	
+	err = 0;
 	while ((tn->full_children > 0 &&
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn);
+		tn = inflate(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
 	}
 
 	check_tnode(tn);
@@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 * Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
+
+	err = 0;
 	while (tn->bits > 1 &&
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold * tnode_child_length(tn))
+	       halve_threshold * tnode_child_length(tn)) {
+
+		tn = halve(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
+	}
 
-		tn = halve(t, tn);
   
 	/* Only one child remains */
 
@@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
@@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn)
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
+
+	/*
+	 * Preallocate and store the new tnodes before doing the actual
+	 * work, so that a failed memory allocation cannot leave the trie
+	 * in an inconsistent state. On failure the old tnode is returned
+	 * unchanged and this inflate is skipped.
+	 */
+			
+	for(i = 0; i < olen; i++) {
+		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+		/* Children passing this test are later given the two tnodes
+		 * stored at tn[2*i] and tn[2*i+1]; allocate both of them now. */
+		if (inode &&
+		    IS_TNODE(inode) &&
+		    inode->pos == oldtnode->pos + oldtnode->bits &&
+		    inode->bits > 1) {
+			struct tnode *left, *right;
+			t_key m = TKEY_GET_MASK(inode->pos, 1);
+
+			left = tnode_new(inode->key&(~m), inode->pos + 1,
+					 inode->bits - 1);
+			if(!left) {
+				*err = -ENOMEM;
+				break;
+			}
+
+			right = tnode_new(inode->key|m, inode->pos + 1,
+					  inode->bits - 1);
+			if(!right) {
+				/* left is not linked into tn yet, so the cleanup
+				 * pass over tn->child[] would miss it; free it here. */
+				tnode_free(left);
+				*err = -ENOMEM;
+				break;
+			}
+			put_child(t, tn, 2*i, (struct node *) left);
+			put_child(t, tn, 2*i+1, (struct node *) right);
+		}
+	}
+
+	if(*err) {
+		int size = tnode_child_length(tn);
+		int j;
+
+		for(j = 0; j < size; j++) 
+			if( tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+		
+		*err = -ENOMEM;
+		return oldtnode;
+	}
 
 	for(i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
@@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
 		   tn->pos + tn->bits - 1) {
-			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
+			if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
 					     1) == 0)
 				put_child(t, tn, 2*i, node);
 			else
@@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 			 * the position (inode->pos)
 			 */
 
-			t_key m = TKEY_GET_MASK(inode->pos, 1);
- 
 			/* Use the old key, but set the new significant 
 			 *   bit to zero. 
 			 */
-			left = tnode_new(inode->key&(~m), inode->pos + 1,
-					 inode->bits - 1);
 
-			if(!left) 
-				trie_bug("tnode_new failed");
-			
-			
-			/* Use the old key, but set the new significant 
-			 * bit to one. 
-			 */
-			right = tnode_new(inode->key|m, inode->pos + 1,
-					  inode->bits - 1);
+			left = (struct tnode *) tnode_get_child(tn, 2*i);
+			put_child(t, tn, 2*i, NULL);
+
+			if(!left)
+				BUG();
+
+			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+			put_child(t, tn, 2*i+1, NULL);
+
+			if(!right)
+				BUG();
 
-			if(!right) 
-				trie_bug("tnode_new failed");
-			
 			size = tnode_child_length(left);
 			for(j = 0; j < size; j++) {
 				put_child(t, left, j, inode->child[j]);
@@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 	return tn;
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *oldtnode = tn;
 	struct node *left, *right;
@@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
   
 	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 
-	if(!tn) 
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
+
+	/*
+	 * Preallocate and store the new tnodes before doing the actual
+	 * work, so that a failed memory allocation cannot leave the trie
+	 * in an inconsistent state. On failure the old tnode is returned
+	 * unchanged and this halve is skipped.
+	 */
+
+	for(i = 0; i < olen; i += 2) {
+		left = tnode_get_child(oldtnode, i);
+		right = tnode_get_child(oldtnode, i+1);
+    
+		/* Two nonempty children */
+		if( left && right)  {
+			struct tnode *newBinNode =
+				tnode_new(left->key, tn->pos + tn->bits, 1);
+
+			if(!newBinNode) {
+				*err = -ENOMEM; 
+				break;
+			}
+			put_child(t, tn, i/2, (struct node *)newBinNode);
+		}
+	}
+
+	if(*err) {
+		int size = tnode_child_length(tn);
+		int j;
+
+		for(j = 0; j < size; j++) 
+			if( tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+		
+		*err = -ENOMEM;
+		return oldtnode;
+	}
 
 	for(i = 0; i < olen; i += 2) {
 		left = tnode_get_child(oldtnode, i);
@@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		/* Two nonempty children */
 		else {
 			struct tnode *newBinNode =
-				tnode_new(left->key, tn->pos + tn->bits, 1);
+				(struct tnode *) tnode_get_child(tn, i/2);
+			put_child(t, tn, i/2, NULL);
 
 			if(!newBinNode) 
-				trie_bug("tnode_new failed");
+				BUG();
 
 			put_child(t, newBinNode, 0, left);
 			put_child(t, newBinNode, 1, right);
@@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
+	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
 #ifdef CLEAR_STATS
 	memset(&(t->stats), 0, sizeof(t->stats));
 #endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb759484979d..279f57abfecb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb)
 		 *	RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
 		 *	  discarded if to broadcast/multicast.
 		 */
-		if (icmph->type == ICMP_ECHO &&
+		if ((icmph->type == ICMP_ECHO ||
+		     icmph->type == ICMP_TIMESTAMP) &&
 		    sysctl_icmp_echo_ignore_broadcasts) {
 			goto error;
 		}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 {
 	int err;
 	u32 addr = imr->imr_multiaddr.s_addr;
-	struct ip_mc_socklist *iml, *i;
+	struct ip_mc_socklist *iml=NULL, *i;
 	struct in_device *in_dev;
 	struct inet_sock *inet = inet_sk(sk);
+	int ifindex;
 	int count = 0;
 
 	if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 		goto done;
 	}
 
-	iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
-
 	err = -EADDRINUSE;
+	ifindex = imr->imr_ifindex;
 	for (i = inet->mc_list; i; i = i->next) {
-		if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
-			/* New style additions are reference counted */
-			if (imr->imr_address.s_addr == 0) {
-				i->count++;
-				err = 0;
-			}
+		if (i->multi.imr_multiaddr.s_addr == addr &&
+		    i->multi.imr_ifindex == ifindex)
 			goto done;
-		}
 		count++;
 	}
 	err = -ENOBUFS;
-	if (iml == NULL || count >= sysctl_igmp_max_memberships)
+	if (count >= sysctl_igmp_max_memberships)
+		goto done;
+	iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+	if (iml == NULL)
 		goto done;
+
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next = inet->mc_list;
-	iml->count = 1;
 	iml->sflist = NULL;
 	iml->sfmode = MCAST_EXCLUDE;
 	inet->mc_list = iml;
 	ip_mc_inc_group(in_dev, addr);
-	iml = NULL;
 	err = 0;
-
 done:
 	rtnl_shunlock();
-	if (iml)
-		sock_kfree_s(sk, iml, sizeof(*iml));
 	return err;
 }
 
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_mc_socklist *iml, **imlp;
+	struct in_device *in_dev;
+	u32 group = imr->imr_multiaddr.s_addr;
+	u32 ifindex;
 
 	rtnl_lock();
+	in_dev = ip_mc_find_dev(imr);
+	if (!in_dev) {
+		rtnl_unlock();
+		return -ENODEV;
+	}
+	ifindex = imr->imr_ifindex;
 	for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
-		if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
-		    iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
-		    (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
-			struct in_device *in_dev;
-
-			in_dev = inetdev_by_index(iml->multi.imr_ifindex);
-			if (in_dev)
-				(void) ip_mc_leave_src(sk, iml, in_dev);
-			if (--iml->count) {
-				rtnl_unlock();
-				if (in_dev)
-					in_dev_put(in_dev);
-				return 0;
-			}
+		if (iml->multi.imr_multiaddr.s_addr == group &&
+		    iml->multi.imr_ifindex == ifindex) {
+			(void) ip_mc_leave_src(sk, iml, in_dev);
 
 			*imlp = iml->next;
 
-			if (in_dev) {
-				ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
-				in_dev_put(in_dev);
-			}
+			ip_mc_dec_group(in_dev, group);
 			rtnl_unlock();
 			sock_kfree_s(sk, iml, sizeof(*iml));
 			return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	struct in_device *in_dev = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_sf_socklist *psl;
+	int leavegroup = 0;
 	int i, j, rv;
 
 	if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	err = -EADDRNOTAVAIL;
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
-		if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+		if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
+		    && pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	/* if a source filter was set, must be the same mode as before */
 	if (pmc->sflist) {
-		if (pmc->sfmode != omode)
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
 			goto done;
+		}
 	} else if (pmc->sfmode != omode) {
 		/* allow mode switches for empty-set filters */
 		ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	psl = pmc->sflist;
 	if (!add) {
 		if (!psl)
-			goto done;
+			goto done;	/* err = -EADDRNOTAVAIL */
 		rv = !0;
 		for (i=0; i<psl->sl_count; i++) {
 			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 				break;
 		}
 		if (rv)		/* source not found */
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
 			goto done;
+		}
 
 		/* update the interface filter */
 		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 		&mreqs->imr_sourceaddr, 1);
 done:
 	rtnl_shunlock();
+	if (leavegroup)
+		return ip_mc_leave_group(sk, &imr);
 	return err;
 }
 
 int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 {
-	int err;
+	int err = 0;
 	struct ip_mreqn	imr;
 	u32 addr = msf->imsf_multiaddr;
 	struct ip_mc_socklist *pmc;
 	struct in_device *in_dev;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_sf_socklist *newpsl, *psl;
+	int leavegroup = 0;
 
 	if (!MULTICAST(addr))
 		return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 		err = -ENODEV;
 		goto done;
 	}
-	err = -EADDRNOTAVAIL;
+
+	/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+	if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+		leavegroup = 1;
+		goto done;
+	}
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
 		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
 		    pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
 	}
-	if (!pmc)		/* must have a prior join */
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
 		goto done;
+	}
 	if (msf->imsf_numsrc) {
 		newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
 				IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 			0, NULL, 0);
 	pmc->sflist = newpsl;
 	pmc->sfmode = msf->imsf_fmode;
+	err = 0;
 done:
 	rtnl_shunlock();
+	if (leavegroup)
+		err = ip_mc_leave_group(sk, &imr);
 	return err;
 }
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6ce5c3292f9f..80d13103b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 	newskb->pkt_type = PACKET_LOOPBACK;
 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
 	BUG_TRAP(newskb->dst);
-	nf_reset(newskb);
 	netif_rx(newskb);
 	return 0;
 }
@@ -188,14 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		skb = skb2;
 	}
 
-#ifdef CONFIG_BRIDGE_NETFILTER
-	/* bridge-netfilter defers calling some IP hooks to the bridge layer
-	 * and still needs the conntrack reference.
-	 */
-	if (skb->nf_bridge == NULL)
-#endif
-		nf_reset(skb);
-
 	if (hh) {
 		int hh_alen;
 
@@ -389,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	to->security = from->security;
 	dst_release(to->dst);
 	to->dst = dst_clone(from->dst);
 	to->dev = from->dev;
@@ -1329,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	ip_rt_put(rt);
 }
 
-/*
- *	IP protocol layer initialiser
- */
-
-static struct packet_type ip_packet_type = {
-	.type = __constant_htons(ETH_P_IP),
-	.func = ip_rcv,
-};
-
-/*
- *	IP registers the packet type and then calls the subprotocol initialisers
- */
-
 void __init ip_init(void)
 {
-	dev_add_pack(&ip_packet_type);
-
 	ip_rt_init();
 	inet_initpeers();
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f89811..fc7c481d0d79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				mreq.imr_address.s_addr = mreqs.imr_interface;
 				mreq.imr_ifindex = 0;
 				err = ip_mc_join_group(sk, &mreq);
-				if (err)
+				if (err && err != -EADDRINUSE)
 					break;
 				omode = MCAST_INCLUDE;
 				add = 1;
-			} else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+			} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
 				omode = MCAST_INCLUDE;
 				add = 0;
 			}
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				mreq.imr_address.s_addr = 0;
 				mreq.imr_ifindex = greqs.gsr_interface;
 				err = ip_mc_join_group(sk, &mreq);
-				if (err)
+				if (err && err != -EADDRINUSE)
 					break;
 				greqs.gsr_interface = mreq.imr_ifindex;
 				omode = MCAST_INCLUDE;
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
 # IP Virtual Server configuration
 #
 menu	"IP: Virtual Server Configuration"
-	depends on INET && NETFILTER
+	depends on NETFILTER
 
 config	IP_VS
 	tristate "IP virtual server support (EXPERIMENTAL)"
-	depends on INET && NETFILTER
+	depends on NETFILTER
 	---help---
 	  IP Virtual Server support will let you build a high-performance
 	  virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 9f16ab309106..d0145a8b1551 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -758,7 +758,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
 	return 1;
 }
 
-
+/* Called from keventd and must protect itself from softirqs */
 void ip_vs_random_dropentry(void)
 {
 	int idx;
@@ -773,7 +773,7 @@ void ip_vs_random_dropentry(void)
 		/*
 		 *  Lock is actually needed in this loop.
 		 */
-		ct_write_lock(hash);
+		ct_write_lock_bh(hash);
 
 		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -806,7 +806,7 @@ void ip_vs_random_dropentry(void)
 				ip_vs_conn_expire_now(cp->control);
 			}
 		}
-		ct_write_unlock(hash);
+		ct_write_unlock_bh(hash);
 	}
 }
 
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 12a82e91d22a..7d99ede2ef79 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
 #endif
 
 /*
- *	update_defense_level is called from keventd and from sysctl.
+ *	update_defense_level is called from keventd and from sysctl,
+ *	so it needs to protect itself from softirqs
  */
 static void update_defense_level(void)
 {
@@ -110,6 +111,8 @@ static void update_defense_level(void)
 
 	nomem = (availmem < sysctl_ip_vs_amemthresh);
 
+	local_bh_disable();
+
 	/* drop_entry */
 	spin_lock(&__ip_vs_dropentry_lock);
 	switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
 	if (to_change >= 0)
 		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
 	write_unlock(&__ip_vs_securetcp_lock);
+
+	local_bh_enable();
 }
 
 
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
 			/* Restore the correct value */
 			*valp = val;
 		} else {
-			local_bh_disable();
 			update_defense_level();
-			local_bh_enable();
 		}
 	}
 	return rc;
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc95102873..1dd824f3cf0a 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
 				        const struct net_device *out,
 				        int (*okfn)(struct sk_buff *))
 {
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+           fragment check. */
+	if ((*pskb)->nfct)
+		return NF_ACCEPT;
+#endif
+
 	/* Gather fragments. */
 	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
 		*pskb = ip_ct_gather_frags(*pskb,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
  *		Marc Boucher	:	routing by fwmark
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
-} __attribute__((__aligned__(8)));
+};
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of one spinlock per rt_hash_bucket, buckets share a table of
+ * spinlocks; its power-of-two size (RT_HASH_LOCK_SZ) grows with NR_CPUS.
+ * rt_hash_lock_init() allocates the table and panics on failure.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ	4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ	2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ	1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ	512
+#else
+#define RT_HASH_LOCK_SZ	256
+#endif
+static spinlock_t	*rt_hash_locks;
+# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
+# define rt_hash_lock_init()	do { \
+		int i; \
+		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+			spin_lock_init(&rt_hash_locks[i]); \
+		} while (0)
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
 
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static int rover;
-	int i = rover, t;
+	static unsigned int rover;
+	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
-
-	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
-	     t -= ip_rt_gc_timeout) {
+	u64 mult;
+
+	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		spin_lock(&rt_hash_table[i].lock);
+		if (*rthp == 0)
+			continue;
+		spin_lock(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
  			rt_free(rth);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
-		spin_unlock(&rt_hash_table[i].lock);
+		spin_unlock(rt_hash_lock_addr(i));
 
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
 				goal--;
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
 				break;
 		}
@@ -882,7 +920,7 @@ restart:
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 		if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			*rp = rth;
@@ -949,7 +987,7 @@ restart:
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
 }
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
 		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
 			"%u.%u.%u.%u, on dev %s\n",
 			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
-		if (dev->hard_header_len) {
+		if (dev->hard_header_len && skb->mac.raw) {
 			int i;
 			unsigned char *p = skb->mac.raw;
 			printk(KERN_WARNING "ll header: ");
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
 
 int __init ip_rt_init(void)
 {
-	int i, order, goal, rc = 0;
+	int rc = 0;
 
 	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
 			     (jiffies ^ (jiffies >> 7)));
 
 #ifdef CONFIG_NET_CLS_ROUTE
+	{
+	int order;
 	for (order = 0;
 	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
 		/* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 	memset(ip_rt_acct, 0, PAGE_SIZE << order);
+	}
 #endif
 
 	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
 	if (!ipv4_dst_ops.kmem_cachep)
 		panic("IP: failed to allocate ip_dst_cache\n");
 
-	goal = num_physpages >> (26 - PAGE_SHIFT);
-	if (rhash_entries)
-		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
-	for (order = 0; (1UL << order) < goal; order++)
-		/* NOTHING */;
-
-	do {
-		rt_hash_mask = (1UL << order) * PAGE_SIZE /
-			sizeof(struct rt_hash_bucket);
-		while (rt_hash_mask & (rt_hash_mask - 1))
-			rt_hash_mask--;
-		rt_hash_table = (struct rt_hash_bucket *)
-			__get_free_pages(GFP_ATOMIC, order);
-	} while (rt_hash_table == NULL && --order > 0);
-
-	if (!rt_hash_table)
-		panic("Failed to allocate IP route cache hash table\n");
-
-	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
-	       rt_hash_mask,
-	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
-
-	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
-		/* NOTHING */;
-
-	rt_hash_mask--;
-	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
-		rt_hash_table[i].chain = NULL;
-	}
+	rt_hash_table = (struct rt_hash_bucket *)
+		alloc_large_system_hash("IP route cache",
+					sizeof(struct rt_hash_bucket),
+					rhash_entries,
+					(num_physpages >= 128 * 1024) ?
+						(27 - PAGE_SHIFT) :
+						(29 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&rt_hash_log,
+					&rt_hash_mask,
+					0);
+	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
 
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 			 size_t psize, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now;
+	int mss_now, size_goal;
 	int err;
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 	copied = 0;
 
 	err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 
-		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
 				goto wait_for_memory;
 
 			skb_entail(sk, tp, skb);
-			copy = mss_now;
+			copy = size_goal;
 		}
 
 		if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
 		if (!(psize -= copy))
 			goto out;
 
-		if (skb->len != mss_now || (flags & MSG_OOB))
+		if (skb->len < mss_now || (flags & MSG_OOB))
 			continue;
 
 		if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
 			goto do_error;
 
 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+		size_goal = tp->xmit_size_goal;
 	}
 
 out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 
 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 {
-	int tmp = tp->mss_cache_std;
+	int tmp = tp->mss_cache;
 
 	if (sk->sk_route_caps & NETIF_F_SG) {
-		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+		if (sk->sk_route_caps & NETIF_F_TSO)
+			tmp = 0;
+		else {
+			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 
-		if (tmp >= pgbreak &&
-		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
-			tmp = pgbreak;
+			if (tmp >= pgbreak &&
+			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+				tmp = pgbreak;
+		}
 	}
+
 	return tmp;
 }
 
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags;
-	int mss_now;
+	int mss_now, size_goal;
 	int err, copied;
 	long timeo;
 
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 
 	/* Ok commence sending. */
 	iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			skb = sk->sk_write_queue.prev;
 
 			if (!sk->sk_send_head ||
-			    (copy = mss_now - skb->len) <= 0) {
+			    (copy = size_goal - skb->len) <= 0) {
 
 new_segment:
 				/* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
-				copy = mss_now;
+				copy = size_goal;
 			}
 
 			/* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				} else if (page) {
-					/* If page is cached, align
-					 * offset to L1 cache boundary
-					 */
-					off = (off + L1_CACHE_BYTES - 1) &
-					      ~(L1_CACHE_BYTES - 1);
 					if (off == PAGE_SIZE) {
 						put_page(page);
 						TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
 			if ((seglen -= copy) == 0 && iovlen == 0)
 				goto out;
 
-			if (skb->len != mss_now || (flags & MSG_OOB))
+			if (skb->len < mss_now || (flags & MSG_OOB))
 				continue;
 
 			if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
 				goto do_error;
 
 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+			size_goal = tp->xmit_size_goal;
 		}
 	}
 
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
 	struct sk_buff *skb;
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
 
 	/* RX process wants to run with disabled BHs, though it is not
 	 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			 * is not empty. It is more elegant, but eats cycles,
 			 * unfortunately.
 			 */
-			if (skb_queue_len(&tp->ucopy.prequeue))
+			if (!skb_queue_empty(&tp->ucopy.prequeue))
 				goto do_prequeue;
 
 			/* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			}
 
 			if (tp->rcv_nxt == tp->copied_seq &&
-			    skb_queue_len(&tp->ucopy.prequeue)) {
+			    !skb_queue_empty(&tp->ucopy.prequeue)) {
 do_prequeue:
 				tcp_prequeue_process(sk);
 
@@ -1472,7 +1476,7 @@ skip_copy:
 	} while (len > 0);
 
 	if (user_recv) {
-		if (skb_queue_len(&tp->ucopy.prequeue)) {
+		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
 
 			tp->ucopy.len = copied > 0 ? len : 0;
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
-	info->tcpi_snd_mss = tp->mss_cache_std;
+	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
 
 	info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	switch (optname) {
 	case TCP_MAXSEG:
-		val = tp->mss_cache_std;
+		val = tp->mss_cache;
 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
 			val = tp->rx_opt.user_mss;
 		break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
 	if (!cwnd) {
-		if (tp->mss_cache_std > 1460)
+		if (tp->mss_cache > 1460)
 			cwnd = 2;
 		else
-			cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
 	}
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		sk->sk_route_caps &= ~NETIF_F_TSO;
 		sock_set_flag(sk, SOCK_NO_LARGESEND);
-		tp->mss_cache = tp->mss_cache_std;
+		/* mss_cache no longer holds a TSO multiple: nothing to undo. */
 	}
 
 	if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			    (IsFack(tp) ||
 			     !before(lost_retrans,
 				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
-				     tp->mss_cache_std))) {
+				     tp->mss_cache))) {
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 				tp->retrans_out -= tcp_skb_pcount(skb);
 
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
 	}
 }
 
-/* There is one downside to this scheme.  Although we keep the
- * ACK clock ticking, adjusting packet counters and advancing
- * congestion window, we do not liberate socket send buffer
- * space.
- *
- * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
- * then making a write space wakeup callback is a possible
- * future enhancement.  WARNING: it is not trivial to make.
- */
 static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 			 __u32 now, __s32 *seq_rtt)
 {
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 		 * the other end.
 		 */
 		if (after(scb->end_seq, tp->snd_una)) {
-			if (tcp_skb_pcount(skb) > 1)
+			if (tcp_skb_pcount(skb) > 1 &&
+			    after(tp->snd_una, scb->seq))
 				acked |= tcp_tso_acked(sk, skb,
 						       now, &seq_rtt);
 			break;
@@ -2810,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	int this_sack;
 
 	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-	if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+	if (skb_queue_empty(&tp->out_of_order_queue)) {
 		tp->rx_opt.num_sacks = 0;
 		tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
 		return;
@@ -2943,13 +2935,13 @@ queue_and_out:
 		if(th->fin)
 			tcp_fin(skb, sk, th);
 
-		if (skb_queue_len(&tp->out_of_order_queue)) {
+		if (!skb_queue_empty(&tp->out_of_order_queue)) {
 			tcp_ofo_queue(sk);
 
 			/* RFC2581. 4.2. SHOULD send immediate ACK, when
 			 * gap in queue is filled.
 			 */
-			if (!skb_queue_len(&tp->out_of_order_queue))
+			if (skb_queue_empty(&tp->out_of_order_queue))
 				tp->ack.pingpong = 0;
 		}
 
@@ -3257,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
 	 * This must not ever occur. */
 
 	/* First, purge the out_of_order queue. */
-	if (skb_queue_len(&tp->out_of_order_queue)) {
-		NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 
-				 skb_queue_len(&tp->out_of_order_queue));
+	if (!skb_queue_empty(&tp->out_of_order_queue)) {
+		NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
 		__skb_queue_purge(&tp->out_of_order_queue);
 
 		/* Reset SACK state.  A conforming SACK implementation will
@@ -3308,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Decide whether the socket send buffer may be enlarged.  Used by
+ * tcp_new_space(); returns 1 when expansion is allowed, 0 otherwise.
+ */
+static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+{
+	/* The user locked the send buffer size with an explicit
+	 * setting, so it must stay as it is.
+	 */
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return 0;
+
+	/* No growth under global TCP memory pressure, whether hard
+	 * (tcp_memory_pressure set) or soft (allocation has reached
+	 * the sysctl_tcp_mem[0] threshold).
+	 */
+	if (tcp_memory_pressure ||
+	    atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+		return 0;
+
+	/* Expand only while the congestion window is not yet filled.  */
+	return tp->packets_out < tp->snd_cwnd;
+}
 
 /* When incoming ACK allowed to free some skb from write_queue,
  * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out < tp->snd_cwnd &&
-	    !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
-	    !tcp_memory_pressure &&
-	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
- 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+	if (tcp_should_expand_sndbuf(sk, tp)) {
+ 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
 			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
 		    demanded = max_t(unsigned int, tp->snd_cwnd,
 						   tp->reordering + 1);
@@ -3346,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
 	}
 }
 
-static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
-	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
-	    tcp_write_xmit(sk, tp->nonagle))
-		tcp_check_probe_timer(sk, tp);
-}
-
-static __inline__ void tcp_data_snd_check(struct sock *sk)
+static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
 {
-	struct sk_buff *skb = sk->sk_send_head;
-
-	if (skb != NULL)
-		__tcp_data_snd_check(sk, skb);
+	tcp_push_pending_frames(sk, tp);
 	tcp_check_space(sk);
 }
 
@@ -3655,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				 */
 				tcp_ack(sk, skb, 0);
 				__kfree_skb(skb); 
-				tcp_data_snd_check(sk);
+				tcp_data_snd_check(sk, tp);
 				return 0;
 			} else { /* Header too small */
 				TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
 				/* Well, only one small jumplet in fast path... */
 				tcp_ack(sk, skb, FLAG_DATA);
-				tcp_data_snd_check(sk);
+				tcp_data_snd_check(sk, tp);
 				if (!tcp_ack_scheduled(tp))
 					goto no_ack;
 			}
@@ -3799,7 +3796,7 @@ step5:
 	/* step 7: process the segment text */
 	tcp_data_queue(sk, skb);
 
-	tcp_data_snd_check(sk);
+	tcp_data_snd_check(sk, tp);
 	tcp_ack_snd_check(sk);
 	return 0;
 
@@ -4109,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		/* Do step6 onward by hand. */
 		tcp_urg(sk, skb, th);
 		__kfree_skb(skb);
-		tcp_data_snd_check(sk);
+		tcp_data_snd_check(sk, tp);
 		return 0;
 	}
 
@@ -4300,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 	/* tcp_data could move socket to TIME-WAIT */
 	if (sk->sk_state != TCP_CLOSE) {
-		tcp_data_snd_check(sk);
+		tcp_data_snd_check(sk, tp);
 		tcp_ack_snd_check(sk);
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache_std = tp->mss_cache = 536;
+	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
 	tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
 
 static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
 				    struct sk_buff *skb)
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
 		tp->ack.pingpong = 1;
 }
 
-static __inline__ void tcp_event_ack_sent(struct sock *sk)
+static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tcp_dec_quickack_mode(tp);
+	tcp_dec_quickack_mode(tp, pkts);
 	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
 }
 
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		tp->af_specific->send_check(sk, th, skb->len, skb);
 
 		if (tcb->flags & TCPCB_FLAG_ACK)
-			tcp_event_ack_sent(sk);
+			tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
 		if (skb->len != tcp_header_size)
 			tcp_event_data_sent(tp, skb, sk);
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_send_head = skb;
 }
 
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
-	/* Force push to be on for any TSO frames to workaround
-	 * problems with busted implementations like Mac OS-X that
-	 * hold off socket receive wakeups until push is seen.
-	 */
-	if (tcp_skb_pcount(skb) > 1)
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = sk->sk_send_head;
 
-	if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
-		/* Send it out now. */
-		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
-		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
-			sk->sk_send_head = NULL;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tcp_packets_out_inc(sk, tp, skb);
-			return;
-		}
-	}
-}
-
-void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (skb->len <= tp->mss_cache_std ||
+	if (skb->len <= tp->mss_cache ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache_std - 1);
-		factor /= tp->mss_cache_std;
+		factor = skb->len + (tp->mss_cache - 1);
+		factor /= tp->mss_cache;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache_std;
+		skb_shinfo(skb)->tso_size = tp->mss_cache;
 	}
 }
 
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	}
 
 	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
 	__skb_append(skb, buff);
 
 	return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = tp->mss_cache_std = mss_now;
+	tp->mss_cache = mss_now;
 
 	return mss_now;
 }
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	unsigned int do_large, mss_now;
+	u32 mss_now;
+	u16 xmit_size_goal;
+	int doing_tso = 0;
+
+	mss_now = tp->mss_cache;
+
+	if (large_allowed &&
+	    (sk->sk_route_caps & NETIF_F_TSO) &&
+	    !tp->urg_mode)
+		doing_tso = 1;
 
-	mss_now = tp->mss_cache_std;
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
 		if (mtu != tp->pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
-	do_large = (large &&
-		    (sk->sk_route_caps & NETIF_F_TSO) &&
-		    !tp->urg_mode);
+	if (tp->rx_opt.eff_sacks)
+		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
-	if (do_large) {
-		unsigned int large_mss, factor, limit;
+	xmit_size_goal = mss_now;
 
-		large_mss = 65535 - tp->af_specific->net_header_len -
+	if (doing_tso) {
+		xmit_size_goal = 65535 -
+			tp->af_specific->net_header_len -
 			tp->ext_header_len - tp->tcp_header_len;
 
-		if (tp->max_window && large_mss > (tp->max_window>>1))
-			large_mss = max((tp->max_window>>1),
-					68U - tp->tcp_header_len);
+		if (tp->max_window &&
+		    (xmit_size_goal > (tp->max_window >> 1)))
+			xmit_size_goal = max((tp->max_window >> 1),
+					     68U - tp->tcp_header_len);
+
+		xmit_size_goal -= (xmit_size_goal % mss_now);
+	}
+	tp->xmit_size_goal = xmit_size_goal;
 
-		factor = large_mss / mss_now;
+	return mss_now;
+}
 
-		/* Always keep large mss multiple of real mss, but
-		 * do not exceed 1/tso_win_divisor of the congestion window
-		 * so we can keep the ACK clock ticking and minimize
-		 * bursting.
-		 */
-		limit = tp->snd_cwnd;
-		if (sysctl_tcp_tso_win_divisor)
-			limit /= sysctl_tcp_tso_win_divisor;
-		limit = max(1U, limit);
-		if (factor > limit)
-			factor = limit;
+/* Congestion window validation (RFC2861): track how much of cwnd is used. */
 
-		tp->mss_cache = mss_now * factor;
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+{
+	__u32 packets_out = tp->packets_out;
+
+	if (packets_out >= tp->snd_cwnd) {
+		/* Network is fully fed: restart the usage tracking. */
+		tp->snd_cwnd_used = 0;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	} else {
+		/* Network starves: remember the peak packets_out. */
+		if (tp->packets_out > tp->snd_cwnd_used)
+			tp->snd_cwnd_used = tp->packets_out;
 
-		mss_now = tp->mss_cache;
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+			tcp_cwnd_application_limited(sk);
 	}
+}
 
-	if (tp->rx_opt.eff_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
-	return mss_now;
+/* Byte limit for sending from SKB: min(send window room, cwnd * mss_now). */
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb,
+				      unsigned int mss_now, unsigned int cwnd)
+{
+	u32 window = tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq;
+
+	return min(window, (u32)(mss_now * cwnd));
+}
+
+/* Congestion window check for SKB.  Returns the number of segments
+ * the window still has room for, or 0 when it is already full.  A
+ * FIN is always granted one segment.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 in_flight, cwnd;
+
+	/* The final FIN may go out even if the window is full.  */
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 1;
+
+	in_flight = tcp_packets_in_flight(tp);
+	cwnd = tp->snd_cwnd;
+
+	/* Room left in segments; none once in_flight reaches cwnd.  */
+	return (in_flight < cwnd) ? (cwnd - in_flight) : 0;
+}
+
+/* Lazily initialise SKB's segment count: when tcp_skb_pcount() is still
+ * zero, tcp_set_skb_tso_segs() fills it in.  Returns the resulting count.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+{
+	int pcount = tcp_skb_pcount(skb);
+
+	if (pcount)
+		return pcount;
+
+	tcp_set_skb_tso_segs(sk, skb);
+	return tcp_skb_pcount(skb);
+}
+
+/* Non-zero when snd_sml is after snd_una and not after snd_nxt.  */
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml,tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0, if packet can be sent now without violating Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+				  const struct sk_buff *skb, 
+				  unsigned mss_now, int nonagle)
+{
+	return (skb->len < mss_now &&
+		((nonagle&TCP_NAGLE_CORK) ||
+		 (!nonagle &&
+		  tp->packets_out &&
+		  tcp_minshall_check(tp))));
+}
+
+/* Nagle gate for SKB, given the current MSS and the 'nonagle' flags:
+ * returns 1 when the packet may be sent right away and 0 when
+ * Nagle's rules say it has to wait.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	/* Frames in the middle of the write_queue have no chance of
+	 * getting new data, so the Nagle rule does not apply to them.
+	 *
+	 * The callers implement this: they pass TCP_NAGLE_PUSH in
+	 * 'nonagle' depending on where SKB sits in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return 1;
+
+	/* Urgent data and the final FIN are exempt from the Nagle rule.  */
+	if (tp->urg_mode)
+		return 1;
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 1;
+
+	/* tcp_nagle_check() is non-zero when the packet must be held.  */
+	return !tcp_nagle_check(tp, skb, cur_mss, nonagle);
+}
+
+/* Does SKB (or just its first cur_mss bytes) end inside the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
+				   unsigned int cur_mss)
+{
+	u32 wnd_edge = tp->snd_una + tp->snd_wnd;
+
+	if (skb->len > cur_mss)
+		return !after(TCP_SKB_CB(skb)->seq + cur_mss, wnd_edge);
+	return !after(TCP_SKB_CB(skb)->end_seq, wnd_edge);
+}
+
+/* Decide whether the data bearing packet SKB (usually sk->sk_send_head)
+ * may go onto the wire right now.  Returns the number of packets the
+ * congestion window allows, or 0 if SKB must not be sent yet.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int quota;
+
+	/* Make sure SKB's segment count is initialised.  */
+	tcp_init_tso_segs(sk, skb);
+
+	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+		return 0;
+
+	quota = tcp_cwnd_test(tp, skb);
+	if (!quota)
+		return 0;
+
+	return tcp_snd_wnd_test(tp, skb, cur_mss) ? quota : 0;
+}
+
+/* Is SKB the tail of sk_write_queue, i.e. does ->next hit the list head? */
+static inline int tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb)
+{
+	return (struct sk_buff *)&sk->sk_write_queue == skb->next;
+}
+
+/* Can sk_send_head go out now?  Nagle applies to the queue tail only. */
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb = sk->sk_send_head;
+
+	return (skb &&
+		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+			     (tcp_skb_is_last(sk, skb) ?
+			      tp->nonagle : TCP_NAGLE_PUSH)));
+}
+
+/* Trim TSO SKB to LEN bytes and put the remaining data into a new
+ * packet linked after SKB on the list.  Very much like tcp_fragment(),
+ * but it may make assumptions that speed up the split: all the data
+ * is in scatter-gather pages (enforced by the BUG_ON below), and the
+ * packet has never been sent out before (and thus is not cloned).
+ * Returns 0 on success, or -ENOMEM if the new skb cannot be allocated.
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+{
+	struct sk_buff *buff;
+	int nlen = skb->len - len;
+	u16 flags;
+
+	/* All of a TSO frame must be composed of paged data.  */
+	BUG_ON(skb->len != skb->data_len);
+
+	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+	if (unlikely(buff == NULL))
+		return -ENOMEM;
+
+	buff->truesize = nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+
+	/* This packet was never sent out yet, so no SACK bits. */
+	TCP_SKB_CB(buff)->sacked = 0;
+
+	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+	skb_split(skb, buff, len);
+
+	/* Recompute the TSO segment counts of both original and new SKB.  */
+	tcp_set_skb_tso_segs(sk, skb);
+	tcp_set_skb_tso_segs(sk, buff);
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	__skb_append(skb, buff);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do.  View it as a kind of TSO Nagle test.
+ * Returns 1 if SKB should be held back for now, 0 if it should go out.
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 send_win, cong_win, limit, in_flight;
+
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 0;
+
+	if (tp->ca_state != TCP_CA_Open)
+		return 0;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+	       (tp->snd_cwnd <= in_flight));
+
+	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+	/* The BUG_ON above guarantees that cwnd > in_flight.  */
+	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If sk_send_head can be sent fully now, just do it.  */
+	if (skb->len <= limit)
+		return 0;
+
+	if (sysctl_tcp_tso_win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= sysctl_tcp_tso_win_divisor;
+		if (limit >= chunk)
+			return 0;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK.  Receiver should ACK every other full sized
+		 * frame, so if we have space for more than
+		 * tcp_max_burst() frames then send now.
+		 */
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
+			return 0;
+	}
+
+	/* Ok, it looks like it is advisable to defer.  */
+	return 1;
 }
 
 /* This routine writes packets to the network.  It advances the
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
  * Returns 1, if no segments are in flight and we have queued segments, but
  * cannot send anything now because of SWS or another problem.
  */
-int tcp_write_xmit(struct sock *sk, int nonagle)
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int mss_now;
+	struct sk_buff *skb;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
 	 * will be happy.
 	 */
-	if (sk->sk_state != TCP_CLOSE) {
-		struct sk_buff *skb;
-		int sent_pkts = 0;
+	if (unlikely(sk->sk_state == TCP_CLOSE))
+		return 0;
+
+	skb = sk->sk_send_head;
+	if (unlikely(!skb))
+		return 0;
+
+	tso_segs = tcp_init_tso_segs(sk, skb);
+	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (unlikely(!cwnd_quota))
+		goto out;
+
+	sent_pkts = 0;
+	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+		BUG_ON(!tso_segs);
+
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (tcp_tso_should_defer(sk, tp, skb))
+				break;
+		}
 
-		/* Account for SACKS, we may need to fragment due to this.
-		 * It is just like the real MSS changing on us midstream.
-		 * We also handle things correctly when the user adds some
-		 * IP options mid-stream.  Silly to do, but cover it.
-		 */
-		mss_now = tcp_current_mss(sk, 1);
-
-		while ((skb = sk->sk_send_head) &&
-		       tcp_snd_test(sk, skb, mss_now,
-			       	    tcp_skb_is_last(sk, skb) ? nonagle :
-				    			       TCP_NAGLE_PUSH)) {
-			if (skb->len > mss_now) {
-				if (tcp_fragment(sk, skb, mss_now))
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (tso_fragment(sk, skb, limit))
 					break;
 			}
-
-			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tcp_tso_set_push(skb);
-			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb,  mss_now)))
 				break;
+		}
 
-			/* Advance the send_head.  This one is sent out.
-			 * This call will increment packets_out.
-			 */
-			update_send_head(sk, tp, skb);
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+			break;
+
+		/* Advance the send_head.  This one is sent out.
+		 * This call will increment packets_out.
+		 */
+		update_send_head(sk, tp, skb);
+
+		tcp_minshall_update(tp, mss_now, skb);
+		sent_pkts++;
+
+		/* Do not optimize this to use tso_segs. If we chopped up
+		 * the packet above, tso_segs will no longer be valid.
+		 */
+		cwnd_quota -= tcp_skb_pcount(skb);
+
+		BUG_ON(cwnd_quota < 0);
+		if (!cwnd_quota)
+			break;
+
+		skb = sk->sk_send_head;
+		if (!skb)
+			break;
+		tso_segs = tcp_init_tso_segs(sk, skb);
+	}
+
+	if (likely(sent_pkts)) {
+		tcp_cwnd_validate(sk, tp);
+		return 0;
+	}
+out:
+	return !tp->packets_out && sk->sk_send_head;
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
+			       unsigned int cur_mss, int nonagle)
+{
+	struct sk_buff *skb = sk->sk_send_head;
 
-			tcp_minshall_update(tp, mss_now, skb);
-			sent_pkts = 1;
+	if (skb) {
+		if (tcp_write_xmit(sk, cur_mss, nonagle))
+			tcp_check_probe_timer(sk, tp);
+	}
+}
+
+/* Send _single_ skb sitting at the send head. This function does not
+ * set up the probe timer etc.; a true push of pending frames does that.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = sk->sk_send_head;
+	unsigned int tso_segs, cwnd_quota;
+
+	BUG_ON(!skb || skb->len < mss_now);
+
+	tso_segs = tcp_init_tso_segs(sk, skb);
+	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+	if (likely(cwnd_quota)) {
+		BUG_ON(!tso_segs);
+
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (unlikely(tso_fragment(sk, skb, limit)))
+					return;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+				return;
 		}
 
-		if (sent_pkts) {
+		/* Send it out now. */
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+			update_send_head(sk, tp, skb);
 			tcp_cwnd_validate(sk, tp);
-			return 0;
+			return;
 		}
-
-		return !tp->packets_out && sk->sk_send_head;
 	}
-	return 0;
 }
 
 /* This function returns the amount that we can raise the
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (sk->sk_route_caps & NETIF_F_TSO) {
 			sk->sk_route_caps &= ~NETIF_F_TSO;
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
-			tp->mss_cache = tp->mss_cache_std;
 		}
 
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * is still in somebody's hands, else make a clone.
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_tso_set_push(skb);
 
 	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
 				    pskb_copy(skb, GFP_ATOMIC):
@@ -1285,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, int priority)
+void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
 				if (sk->sk_route_caps & NETIF_F_TSO) {
 					sock_set_flag(sk, SOCK_NO_LARGESEND);
 					sk->sk_route_caps &= ~NETIF_F_TSO;
-					tp->mss_cache = tp->mss_cache_std;
 				}
 			} else if (!tcp_skb_pcount(skb))
 				tcp_set_skb_tso_segs(sk, skb);
 
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tcp_tso_set_push(skb);
 			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 			if (!err) {
 				update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b4498565..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
 	}
 	tp->ack.pending &= ~TCP_ACK_TIMER;
 
-	if (skb_queue_len(&tp->ucopy.prequeue)) {
+	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
 
-		NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 
-				 skb_queue_len(&tp->ucopy.prequeue));
+		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
 
 		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 			sk->sk_backlog_rcv(sk, skb);