From c13c8260da3155f2cefb63b0d1b7dcdcb405c644 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:18:44 -0700
Subject: [I/OAT]: DMA memcpy subsystem

Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dmaengine.h | 337 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 337 insertions(+)
 create mode 100644 include/linux/dmaengine.h

(limited to 'include')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
new file mode 100644
index 000000000000..30781546ac99
--- /dev/null
+++ b/include/linux/dmaengine.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef DMAENGINE_H
+#define DMAENGINE_H
+#include <linux/config.h>
+#ifdef CONFIG_DMA_ENGINE
+
+#include <linux/device.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/**
+ * enum dma_event - resource PNP/power managment events
+ * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
+ * @DMA_RESOURCE_RESUME: DMA device returning to full power
+ * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_REMOVED: DMA device removed from the system
+ */
+enum dma_event {
+	DMA_RESOURCE_SUSPEND,
+	DMA_RESOURCE_RESUME,
+	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_REMOVED,
+};
+
+/**
+ * typedef dma_cookie_t
+ *
+ * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code
+ */
+typedef s32 dma_cookie_t;
+
+#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0)
+
+/**
+ * enum dma_status - DMA transaction status
+ * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_IN_PROGRESS: transaction not yet processed
+ * @DMA_ERROR: transaction failed
+ */
+enum dma_status {
+	DMA_SUCCESS,
+	DMA_IN_PROGRESS,
+	DMA_ERROR,
+};
+
+/**
+ * struct dma_chan_percpu - the per-CPU part of struct dma_chan
+ * @refcount: local_t used for open-coded "bigref" counting
+ * @memcpy_count: transaction counter
+ * @bytes_transferred: byte counter
+ */
+
+struct dma_chan_percpu {
+	local_t refcount;
+	/* stats */
+	unsigned long memcpy_count;
+	unsigned long bytes_transferred;
+};
+
+/**
+ * struct dma_chan - devices supply DMA channels, clients use them
+ * @client: ptr to the client user of this chan, will be NULL when unused
+ * @device: ptr to the dma device who supplies this channel, always !NULL
+ * @cookie: last cookie value returned to client
+ * @chan_id:
+ * @class_dev:
+ * @refcount: kref, used in "bigref" slow-mode
+ * @slow_ref:
+ * @rcu:
+ * @client_node: used to add this to the client chan list
+ * @device_node: used to add this to the device chan list
+ * @local: per-cpu pointer to a struct dma_chan_percpu
+ */
+struct dma_chan {
+	struct dma_client *client;
+	struct dma_device *device;
+	dma_cookie_t cookie;
+
+	/* sysfs */
+	int chan_id;
+	struct class_device class_dev;
+
+	struct kref refcount;
+	int slow_ref;
+	struct rcu_head rcu;
+
+	struct list_head client_node;
+	struct list_head device_node;
+	struct dma_chan_percpu *local;
+};
+
+void dma_chan_cleanup(struct kref *kref);
+
+static inline void dma_chan_get(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_get(&chan->refcount);
+	else {
+		local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+static inline void dma_chan_put(struct dma_chan *chan)
+{
+	if (unlikely(chan->slow_ref))
+		kref_put(&chan->refcount, dma_chan_cleanup);
+	else {
+		local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount));
+		put_cpu();
+	}
+}
+
+/*
+ * typedef dma_event_callback - function pointer to a DMA event callback
+ */
+typedef void (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_event event);
+
+/**
+ * struct dma_client - info on the entity making use of DMA services
+ * @event_callback: func ptr to call when something happens
+ * @chan_count: number of chans allocated
+ * @chans_desired: number of chans requested. Can be +/- chan_count
+ * @lock: protects access to the channels list
+ * @channels: the list of DMA channels allocated
+ * @global_node: list_head for global dma_client_list
+ */
+struct dma_client {
+	dma_event_callback	event_callback;
+	unsigned int		chan_count;
+	unsigned int		chans_desired;
+
+	spinlock_t		lock;
+	struct list_head	channels;
+	struct list_head	global_node;
+};
+
+/**
+ * struct dma_device - info on the entity supplying DMA services
+ * @chancnt: how many DMA channels are supported
+ * @channels: the list of struct dma_chan
+ * @global_node: list_head for global dma_device_list
+ * @refcount:
+ * @done:
+ * @dev_id:
+ * Other func ptrs: used to make use of this device's capabilities
+ */
+struct dma_device {
+
+	unsigned int chancnt;
+	struct list_head channels;
+	struct list_head global_node;
+
+	struct kref refcount;
+	struct completion done;
+
+	int dev_id;
+
+	int (*device_alloc_chan_resources)(struct dma_chan *chan);
+	void (*device_free_chan_resources)(struct dma_chan *chan);
+	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
+			void *dest, void *src, size_t len);
+	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
+			struct page *page, unsigned int offset, void *kdata,
+			size_t len);
+	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
+			struct page *dest_pg, unsigned int dest_off,
+			struct page *src_pg, unsigned int src_off, size_t len);
+	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+			dma_cookie_t cookie, dma_cookie_t *last,
+			dma_cookie_t *used);
+	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+};
+
+/* --- public DMA engine API --- */
+
+struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_unregister(struct dma_client *client);
+void dma_async_client_chan_request(struct dma_client *client,
+		unsigned int number);
+
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
+	                                             kdata, len);
+}
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy
+ * @chan: DMA channel to offload copy to
+ * @dest_page: destination page
+ * @dest_off: offset in page to copy to
+ * @src_page: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len)
+{
+	int cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
+	                                            src_pg, src_off, len);
+}
+
+/**
+ * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * @chan:
+ *
+ * This allows drivers to push copies to HW in batches,
+ * reducing MMIO writes where possible.
+ */
+static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+{
+	return chan->device->device_memcpy_issue_pending(chan);
+}
+
+/**
+ * dma_async_memcpy_complete - poll for transaction completion
+ * @chan: DMA channel
+ * @cookie: transaction identifier to check status of
+ * @last: returns last completed cookie, can be NULL
+ * @used: returns last issued cookie, can be NULL
+ *
+ * If @last and @used are passed in, upon return they reflect the driver
+ * internal state and can be used with dma_async_is_complete() to check
+ * the status of multiple cookies without re-checking hardware state.
+ */
+static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
+{
+	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+}
+
+/**
+ * dma_async_is_complete - test a cookie against chan state
+ * @cookie: transaction identifier to test status of
+ * @last_complete: last know completed transaction
+ * @last_used: last cookie value handed out
+ *
+ * dma_async_is_complete() is used in dma_async_memcpy_complete()
+ * the test logic is seperated for lightweight testing of multiple cookies
+ */
+static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
+			dma_cookie_t last_complete, dma_cookie_t last_used)
+{
+	if (last_complete <= last_used) {
+		if ((cookie <= last_complete) || (cookie > last_used))
+			return DMA_SUCCESS;
+	} else {
+		if ((cookie <= last_complete) && (cookie > last_used))
+			return DMA_SUCCESS;
+	}
+	return DMA_IN_PROGRESS;
+}
+
+
+/* --- DMA device --- */
+
+int dma_async_device_register(struct dma_device *device);
+void dma_async_device_unregister(struct dma_device *device);
+
+#endif /* CONFIG_DMA_ENGINE */
+#endif /* DMAENGINE_H */
-- 
cgit v1.2.3


From 57c651f74cd8383df10a648e677902849de1bc0b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.sfo1.dsl.speakeasy.net>
Date: Tue, 23 May 2006 17:39:49 -0700
Subject: [I/OAT]: Move PCI_DEVICE_ID_INTEL_IOAT to linux/pci_ids.h

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/ioatdma.h   | 3 +--
 include/linux/pci_ids.h | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index 312353d12af2..a5d3b3644160 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -26,8 +26,7 @@
 #include <linux/init.h>
 #include <linux/dmapool.h>
 #include <linux/cache.h>
-
-#define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
+#include <linux/pci_ids.h>
 
 #define IOAT_LOW_COMPLETION_MASK	0xffffffc0
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 590dc6dca315..819c8e1324d1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2043,6 +2043,7 @@
 #define PCI_DEVICE_ID_INTEL_80960_RP	0x1960
 #define PCI_DEVICE_ID_INTEL_82840_HB	0x1a21
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
+#define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
 #define PCI_DEVICE_ID_INTEL_82801AA_0	0x2410
 #define PCI_DEVICE_ID_INTEL_82801AA_1	0x2411
 #define PCI_DEVICE_ID_INTEL_82801AA_3	0x2413
-- 
cgit v1.2.3


From db21733488f84a596faaad0d05430b3f51804692 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Sat, 17 Jun 2006 21:24:58 -0700
Subject: [I/OAT]: Setup the networking subsystem as a DMA client

Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/Kconfig       |  12 ++++++
 include/linux/netdevice.h |   4 ++
 include/net/netdma.h      |  38 +++++++++++++++++
 net/core/dev.c            | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 158 insertions(+)
 create mode 100644 include/net/netdma.h

(limited to 'include')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e769c6bc..30d021d1a07c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
 	  DMA engines offload copy operations from the CPU to dedicated
 	  hardware, allowing the copies to happen asynchronously.
 
+comment "DMA Clients"
+
+config NET_DMA
+	bool "Network: TCP receive copy offload"
+	depends on DMA_ENGINE && NET
+	default y
+	---help---
+	  This enables the use of DMA engines in the network stack to
+	  offload receive copy-to-user operations, freeing CPU cycles.
+	  Since this is the main user of the DMA engine, it should be enabled;
+	  say Y here.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f4169bbb60eb..b5760c67af9c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#include <linux/dmaengine.h>
 
 struct divert_blk;
 struct vlan_group;
@@ -593,6 +594,9 @@ struct softnet_data
 	struct sk_buff		*completion_queue;
 
 	struct net_device	backlog_dev;	/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+	struct dma_chan		*net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000000000000..cbfe89d7e5d0
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/config.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+	struct dma_chan *chan;
+	rcu_read_lock();
+	chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+	if (chan)
+		dma_chan_get(chan);
+	rcu_read_unlock();
+	return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fba549caf29..6bfa78c66c25 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
+#include <linux/dmaengine.h>
 
 /*
  *	The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtnl
  * semaphore.
@@ -1846,6 +1853,19 @@ static void net_rx_action(struct softirq_action *h)
 		}
 	}
 out:
+#ifdef CONFIG_NET_DMA
+	/*
+	 * There may not be any more sk_buffs coming right now, so push
+	 * any pending DMA copies to hardware
+	 */
+	if (net_dma_client) {
+		struct dma_chan *chan;
+		rcu_read_lock();
+		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+			dma_async_memcpy_issue_pending(chan);
+		rcu_read_unlock();
+	}
+#endif
 	local_irq_enable();
 	return;
 
@@ -3300,6 +3320,88 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+	unsigned int cpu, i, n;
+	struct dma_chan *chan;
+
+	lock_cpu_hotplug();
+
+	if (net_dma_count == 0) {
+		for_each_online_cpu(cpu)
+			rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL);
+		unlock_cpu_hotplug();
+		return;
+	}
+
+	i = 0;
+	cpu = first_cpu(cpu_online_map);
+
+	rcu_read_lock();
+	list_for_each_entry(chan, &net_dma_client->channels, client_node) {
+		n = ((num_online_cpus() / net_dma_count)
+		   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+
+		while(n) {
+			per_cpu(softnet_data.net_dma, cpu) = chan;
+			cpu = next_cpu(cpu, cpu_online_map);
+			n--;
+		}
+		i++;
+	}
+	rcu_read_unlock();
+
+	unlock_cpu_hotplug();
+}
+
+/**
+ * netdev_dma_event - event callback for the net_dma_client
+ * @client: should always be net_dma_client
+ * @chan:
+ * @event:
+ */
+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_event event)
+{
+	spin_lock(&net_dma_event_lock);
+	switch (event) {
+	case DMA_RESOURCE_ADDED:
+		net_dma_count++;
+		net_dma_rebalance();
+		break;
+	case DMA_RESOURCE_REMOVED:
+		net_dma_count--;
+		net_dma_rebalance();
+		break;
+	default:
+		break;
+	}
+	spin_unlock(&net_dma_event_lock);
+}
+
+/**
+ * netdev_dma_regiser - register the networking subsystem as a DMA client
+ */
+static int __init netdev_dma_register(void)
+{
+	spin_lock_init(&net_dma_event_lock);
+	net_dma_client = dma_async_client_register(netdev_dma_event);
+	if (net_dma_client == NULL)
+		return -ENOMEM;
+
+	dma_async_client_chan_request(net_dma_client, num_online_cpus());
+	return 0;
+}
+
+#else
+static int __init netdev_dma_register(void) { return -ENODEV; }
+#endif /* CONFIG_NET_DMA */
 
 /*
  *	Initialize the DEV module. At boot time this walks the device list and
@@ -3353,6 +3455,8 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
+	netdev_dma_register();
+
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
-- 
cgit v1.2.3


From de5506e155276d385712c2aa1c2d9a27cd4ed947 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:50:37 -0700
Subject: [I/OAT]: Utility functions for offloading sk_buff to iovec copies

Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/Makefile      |   3 +-
 drivers/dma/iovlock.c     | 301 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  22 ++++
 include/net/netdma.h      |   6 +
 net/core/Makefile         |   1 +
 net/core/user_dma.c       | 127 +++++++++++++++++++
 6 files changed, 459 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dma/iovlock.c
 create mode 100644 net/core/user_dma.c

(limited to 'include')

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f5677313..bdcfdbdb1aec 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 000000000000..5ed327e453a2
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+int num_pages_spanned(struct iovec *iov)
+{
+	return
+	((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+	((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+	struct dma_pinned_list *local_list;
+	struct page **pages;
+	int i;
+	int ret;
+	int nr_iovecs = 0;
+	int iovec_len_used = 0;
+	int iovec_pages_used = 0;
+	long err;
+
+	/* don't pin down non-user-based iovecs */
+	if (segment_eq(get_fs(), KERNEL_DS))
+		return NULL;
+
+	/* determine how many iovecs/pages there are, up front */
+	do {
+		iovec_len_used += iov[nr_iovecs].iov_len;
+		iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]);
+		nr_iovecs++;
+	} while (iovec_len_used < len);
+
+	/* single kmalloc for pinned list, page_list[], and the page arrays */
+	local_list = kmalloc(sizeof(*local_list)
+		+ (nr_iovecs * sizeof (struct dma_page_list))
+		+ (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+	if (!local_list) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* list of pages starts right after the page list array */
+	pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+	for (i = 0; i < nr_iovecs; i++) {
+		struct dma_page_list *page_list = &local_list->page_list[i];
+
+		len -= iov[i].iov_len;
+
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+			err = -EFAULT;
+			goto unpin;
+		}
+
+		page_list->nr_pages = num_pages_spanned(&iov[i]);
+		page_list->base_address = iov[i].iov_base;
+
+		page_list->pages = pages;
+		pages += page_list->nr_pages;
+
+		/* pin pages down */
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(
+			current,
+			current->mm,
+			(unsigned long) iov[i].iov_base,
+			page_list->nr_pages,
+			1,	/* write */
+			0,	/* force */
+			page_list->pages,
+			NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret != page_list->nr_pages) {
+			err = -ENOMEM;
+			goto unpin;
+		}
+
+		local_list->nr_iovecs = i + 1;
+	}
+
+	return local_list;
+
+unpin:
+	dma_unpin_iovec_pages(local_list);
+out:
+	return ERR_PTR(err);
+}
+
+void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
+{
+	int i, j;
+
+	if (!pinned_list)
+		return;
+
+	for (i = 0; i < pinned_list->nr_iovecs; i++) {
+		struct dma_page_list *page_list = &pinned_list->page_list[i];
+		for (j = 0; j < page_list->nr_pages; j++) {
+			set_page_dirty_lock(page_list->pages[j]);
+			page_cache_release(page_list->pages[j]);
+		}
+	}
+
+	kfree(pinned_list);
+}
+
+static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct
+	iovec *iov, unsigned char *kdata, size_t len)
+{
+	dma_cookie_t dma_cookie = 0;
+
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			dma_cookie = dma_async_memcpy_buf_to_buf(
+					chan,
+					iov->iov_base,
+					kdata,
+					copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return dma_cookie;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in iov array has corresponding entry in pinned_list->page_list.
+ * Using array indexing to keep iov[] and page_list[] in sync.
+ * Initial elements in iov array's iov->iov_len will be 0 if already copied into
+ *   by another call.
+ * iov array length remaining guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+
+	if (!chan)
+		return memcpy_toiovec(iov, kdata, len);
+
+	/* -> kernel copies (e.g. smbfs) */
+	if (!pinned_list)
+		return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len);
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					kdata,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			kdata += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
+
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len)
+{
+	int iov_byte_offset;
+	int copy;
+	dma_cookie_t dma_cookie = 0;
+	int iovec_idx;
+	int page_idx;
+	int err;
+
+	/* this needs as-yet-unimplemented buf-to-buff, so punt. */
+	/* TODO: use dma for this */
+	if (!chan || !pinned_list) {
+		u8 *vaddr = kmap(page);
+		err = memcpy_toiovec(iov, vaddr + offset, len);
+		kunmap(page);
+		return err;
+	}
+
+	iovec_idx = 0;
+	while (iovec_idx < pinned_list->nr_iovecs) {
+		struct dma_page_list *page_list;
+
+		/* skip already used-up iovecs */
+		while (!iov[iovec_idx].iov_len)
+			iovec_idx++;
+
+		page_list = &pinned_list->page_list[iovec_idx];
+
+		iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+		page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+			 - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+		/* break up copies to not cross page boundary */
+		while (iov[iovec_idx].iov_len) {
+			copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+			copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+			dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+					page_list->pages[page_idx],
+					iov_byte_offset,
+					page,
+					offset,
+					copy);
+
+			len -= copy;
+			iov[iovec_idx].iov_len -= copy;
+			iov[iovec_idx].iov_base += copy;
+
+			if (!len)
+				return dma_cookie;
+
+			offset += copy;
+			iov_byte_offset = 0;
+			page_idx++;
+		}
+		iovec_idx++;
+	}
+
+	/* really bad if we ever run out of iovecs */
+	BUG();
+	return -EFAULT;
+}
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 30781546ac99..78b236ca04f8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -333,5 +333,27 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 int dma_async_device_register(struct dma_device *device);
 void dma_async_device_unregister(struct dma_device *device);
 
+/* --- Helper iov-locking functions --- */
+
+struct dma_page_list {
+	char *base_address;
+	int nr_pages;
+	struct page **pages;
+};
+
+struct dma_pinned_list {
+	int nr_iovecs;
+	struct dma_page_list page_list[0];
+};
+
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len);
+void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list);
+
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len);
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+	struct dma_pinned_list *pinned_list, struct page *page,
+	unsigned int offset, size_t len);
+
 #endif /* CONFIG_DMA_ENGINE */
 #endif /* DMAENGINE_H */
diff --git a/include/net/netdma.h b/include/net/netdma.h
index cbfe89d7e5d0..19760eb131aa 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -23,6 +23,7 @@
 #include <linux/config.h>
 #ifdef CONFIG_NET_DMA
 #include <linux/dmaengine.h>
+#include <linux/skbuff.h>
 
 static inline struct dma_chan *get_softnet_dma(void)
 {
@@ -34,5 +35,10 @@ static inline struct dma_chan *get_softnet_dma(void)
 	rcu_read_unlock();
 	return chan;
 }
+
+int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
+		const struct sk_buff *skb, int offset, struct iovec *to,
+		size_t len, struct dma_pinned_list *pinned_list);
+
 #endif /* CONFIG_NET_DMA */
 #endif /* NETDMA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12cced27..e9bd2467d5a9 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 000000000000..9eee91bcbf3f
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h> /* for BUG_TRAP */
+#include <net/tcp.h>
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb - buffer to copy
+ *	@offset - offset in the buffer to start copying from
+ *	@iovec - io vector to copy to
+ *	@len - amount of data to copy from buffer to iovec
+ *	@pinned_list - locked iovec buffer data
+ *
+ *	Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_pinned_list *pinned_list)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+		                            skb->data + offset, copy);
+		if (cookie < 0)
+			goto fault;
+		len -= copy;
+		if (len == 0)
+			goto end;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		copy = end - offset;
+		if ((copy = end - offset) > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+					frag->page_offset + offset - start, copy);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset + len);
+
+			end = start + list->len;
+			copy = end - offset;
+			if (copy > 0) {
+				if (copy > len)
+					copy = len;
+				cookie = dma_skb_copy_datagram_iovec(chan, list,
+				                offset - start, to, copy,
+				                pinned_list);
+				if (cookie < 0)
+					goto fault;
+				len -= copy;
+				if (len == 0)
+					goto end;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;
+		return cookie;
+	}
+
+fault:
+ 	return -EFAULT;
+}
-- 
cgit v1.2.3


From 97fc2f0848c928c63c2ae619deee61a0b1107b69 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 17:55:33 -0700
Subject: [I/OAT]: Structure changes for TCP recv offload to I/OAT

Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 include/linux/tcp.h    | 8 ++++++++
 include/net/sock.h     | 2 ++
 include/net/tcp.h      | 7 +++++++
 net/core/sock.c        | 6 ++++++
 5 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f234708b98..23bad3bf3c9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include <linux/net.h>
 #include <linux/textsearch.h>
 #include <net/checksum.h>
+#include <linux/dmaengine.h>
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
@@ -285,6 +286,9 @@ struct sk_buff {
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+	dma_cookie_t		dma_cookie;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d39596bd8..c90daa5da6c3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
+#include <linux/dmaengine.h>
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
 		struct iovec		*iov;
 		int			memory;
 		int			len;
+#ifdef CONFIG_NET_DMA
+		/* members for async copy */
+		struct dma_chan		*dma_chan;
+		int			wakeup;
+		struct dma_pinned_list	*pinned_list;
+		dma_cookie_t		dma_cookie;
+#endif
 	} ucopy;
 
 	__u32	snd_wl1;	/* Sequence for window update		*/
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6fb629b..90c65cb091a8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
   *	@sk_write_queue: Packet sending queue
+  *	@sk_async_wait_queue: DMA copied packets
   *	@sk_omem_alloc: "o" is "option" or "other"
   *	@sk_wmem_queued: persistent queue size
   *	@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
+	struct sk_buff_head	sk_async_wait_queue;
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db8a7aa..d0c2c2fa7587 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
+#include <linux/dmaengine.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
 	tp->ucopy.len = 0;
 	tp->ucopy.memory = 0;
 	skb_queue_head_init(&tp->ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	tp->ucopy.wakeup = 0;
+	tp->ucopy.pinned_list = NULL;
+	tp->ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb9ea2d..5d820c376653 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+		skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
 
 		rwlock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	skb_queue_head_init(&sk->sk_receive_queue);
 	skb_queue_head_init(&sk->sk_write_queue);
 	skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+	skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
 
 	sk->sk_send_head	=	NULL;
 
-- 
cgit v1.2.3


From 0e4b4992b8007c6b62ec143cbbb292f98813ca11 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:00:16 -0700
Subject: [I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h |  2 ++
 net/ipv4/tcp.c    | 10 +++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0c2c2fa7587..578cccf275d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int			tcp_rcv_established(struct sock *sk,
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
+extern void			tcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int			tcp_twsk_unique(struct sock *sk,
 						struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b8055037..1c0cfd7a8bbb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 	/* Clean up data we have read: This will do ACK frames. */
 	if (copied)
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
 
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			}
 		}
 
-		cleanup_rbuf(sk, copied);
+		tcp_cleanup_rbuf(sk, copied);
 
 		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
 	 */
 
 	/* Clean up data we have read: This will do ACK frames. */
-	cleanup_rbuf(sk, copied);
+	tcp_cleanup_rbuf(sk, copied);
 
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
@@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
 			    inet_csk_ack_scheduled(sk)) {
 				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-				cleanup_rbuf(sk, 1);
+				tcp_cleanup_rbuf(sk, 1);
 				if (!(val & 1))
 					icsk->icsk_ack.pingpong = 1;
 			}
-- 
cgit v1.2.3


From 624d1164730d58a494cc5aa4afa37d02c41e83a7 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:01:28 -0700
Subject: [I/OAT]: Make sk_eat_skb I/OAT aware.

Add an extra argument to sk_eat_skb, and make it move early copied
packets to the async_wait_queue instead of freeing them.

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 13 ++++++++++++-
 net/dccp/proto.c   |  4 ++--
 net/ipv4/tcp.c     |  8 ++++----
 net/llc/af_llc.c   |  2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 90c65cb091a8..75b0e97ed93d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
+{
+	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (!copied_early)
+		__kfree_skb(skb);
+	else
+		__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
 	__kfree_skb(skb);
 }
+#endif
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee8355c41..5317fd3e6691 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		}
 		dccp_pr_debug("packet_type=%s\n",
 			      dccp_packet_name(dh->dccph_type));
-		sk_eat_skb(sk, skb);
+		sk_eat_skb(sk, skb, 0);
 verify_sock_status:
 		if (sock_flag(sk, SOCK_DONE)) {
 			len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
 		}
 	found_fin_ok:
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		break;
 	} while (1);
 out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c0cfd7a8bbb..4e067d25a63c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				break;
 		}
 		if (skb->h.th->fin) {
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
 	}
@@ -1356,14 +1356,14 @@ skip_copy:
 		if (skb->h.th->fin)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 		break;
 	} while (len > 0);
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db745c8d..7465170a36ca 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 			continue;
 
 		if (!(flags & MSG_PEEK)) {
-			sk_eat_skb(sk, skb);
+			sk_eat_skb(sk, skb, 0);
 			*seq = 0;
 		}
 	} while (len > 0);
-- 
cgit v1.2.3


From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001
From: Chris Leech <christopher.leech@intel.com>
Date: Tue, 23 May 2006 18:02:55 -0700
Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold

Any socket recv of less than this ammount will not be offloaded

Signed-off-by: Chris Leech <christopher.leech@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |  1 +
 include/net/tcp.h          |  1 +
 net/core/user_dma.c        |  4 ++++
 net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++
 4 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff76f82..cd9e7c0825ad 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
  	NET_TCP_MTU_PROBING=113,
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+	NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 578cccf275d3..f1f472746e6c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 9eee91bcbf3f..b7c98dbcdb81 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -30,6 +30,10 @@
 #include <linux/rtnetlink.h> /* for BUG_TRAP */
 #include <net/tcp.h>
 
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3adfcf00..6a6aa537b7aa 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3


From aecbd4e45c2e469e0452ffb2c0b0d881e2815bb8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 25 May 2006 15:08:30 -0700
Subject: [LLC]: use more efficient ether address routines

Use more cache efficient Ethernet address manipulation functions
in etherdevice.h.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
 include/net/llc_if.h | 13 ++++++-------
 net/llc/llc_if.c     |  2 --
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/llc_if.h b/include/net/llc_if.h
index 090eaa0d71f9..a05d04ac4513 100644
--- a/include/net/llc_if.h
+++ b/include/net/llc_if.h
@@ -16,6 +16,7 @@
 #include <linux/if.h>
 #include <linux/if_arp.h>
 #include <linux/llc.h>
+#include <linux/etherdevice.h>
 #include <net/llc.h>
 
 #define LLC_DATAUNIT_PRIM	1
@@ -61,8 +62,6 @@
 #define LLC_STATUS_CONFLICT	7 /* disconnect conn */
 #define LLC_STATUS_RESET_DONE	8 /*  */
 
-extern u8 llc_mac_null_var[IFHWADDRLEN];
-
 /**
  *      llc_mac_null - determines if a address is a null mac address
  *      @mac: Mac address to test if null.
@@ -70,12 +69,12 @@ extern u8 llc_mac_null_var[IFHWADDRLEN];
  *      Determines if a given address is a null mac address.  Returns 0 if the
  *      address is not a null mac, 1 if the address is a null mac.
  */
-static __inline__ int llc_mac_null(u8 *mac)
+static inline int llc_mac_null(const u8 *mac)
 {
-	return !memcmp(mac, llc_mac_null_var, IFHWADDRLEN);
+	return is_zero_ether_addr(mac);
 }
 
-static __inline__ int llc_addrany(struct llc_addr *addr)
+static inline int llc_addrany(const struct llc_addr *addr)
 {
 	return llc_mac_null(addr->mac) && !addr->lsap;
 }
@@ -89,9 +88,9 @@ static __inline__ int llc_addrany(struct llc_addr *addr)
  *	is not a complete match up to len, 1 if a complete match up to len is
  *	found.
  */
-static __inline__ int llc_mac_match(u8 *mac1, u8 *mac2)
+static inline int llc_mac_match(const u8 *mac1, const u8 *mac2)
 {
-	return !memcmp(mac1, mac2, IFHWADDRLEN);
+	return !compare_ether_addr(mac1, mac2);
 }
 
 extern int llc_establish_connection(struct sock *sk, u8 *lmac,
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index ba90f7f0801a..5ae47be7dde0 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -26,8 +26,6 @@
 #include <net/llc_c_st.h>
 #include <net/tcp_states.h>
 
-u8 llc_mac_null_var[IFHWADDRLEN];
-
 /**
  *	llc_build_and_send_pkt - Connection data sending for upper layers.
  *	@sk: connection
-- 
cgit v1.2.3


From bc0e646796928918e45b6465e02616f2fe65c3c1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 25 May 2006 15:10:37 -0700
Subject: [LLC]: add multicast support for datagrams

Allow mulitcast reception of datagrams (similar to UDP).
All sockets bound to the same SAP receive a clone.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/llc_if.h |  4 ++++
 net/llc/llc_sap.c    | 59 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/llc_if.h b/include/net/llc_if.h
index a05d04ac4513..c608812a8e89 100644
--- a/include/net/llc_if.h
+++ b/include/net/llc_if.h
@@ -79,6 +79,10 @@ static inline int llc_addrany(const struct llc_addr *addr)
 	return llc_mac_null(addr->mac) && !addr->lsap;
 }
 
+static inline int llc_mac_multicast(const u8 *mac)
+{
+	return is_multicast_ether_addr(mac);
+}
 /**
  *	llc_mac_match - determines if two mac addresses are the same
  *	@mac1: First mac address to compare.
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 4029ceee9b91..20c4eb5c1ac6 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb)
  *	mac, and local sap. Returns pointer for socket found, %NULL otherwise.
  */
 static struct sock *llc_lookup_dgram(struct llc_sap *sap,
-				     struct llc_addr *laddr)
+				     const struct llc_addr *laddr)
 {
 	struct sock *rc;
 	struct hlist_node *node;
@@ -304,19 +304,62 @@ found:
 	return rc;
 }
 
+/**
+ * 	llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
+ *	@sap: SAP
+ *	@laddr: address of local LLC (MAC + SAP)
+ *
+ *	Search socket list of the SAP and finds connections with same sap.
+ *	Deliver clone to each.
+ */
+static void llc_sap_mcast(struct llc_sap *sap,
+			  const struct llc_addr *laddr,
+			  struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	read_lock_bh(&sap->sk_list.lock);
+	sk_for_each(sk, node, &sap->sk_list.list) {
+		struct llc_sock *llc = llc_sk(sk);
+		struct sk_buff *skb1;
+
+		if (sk->sk_type != SOCK_DGRAM)
+			continue;
+
+		if (llc->laddr.lsap != laddr->lsap)
+			continue;
+
+		skb1 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb1)
+			break;
+
+		sock_hold(sk);
+		skb_set_owner_r(skb1, sk);
+		llc_sap_rcv(sap, skb1);
+		sock_put(sk);
+	}
+	read_unlock_bh(&sap->sk_list.lock);
+}
+
+
 void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
 {
 	struct llc_addr laddr;
-	struct sock *sk;
 
 	llc_pdu_decode_da(skb, laddr.mac);
 	llc_pdu_decode_dsap(skb, &laddr.lsap);
 
-	sk = llc_lookup_dgram(sap, &laddr);
-	if (sk) {
-		skb_set_owner_r(skb, sk);
-		llc_sap_rcv(sap, skb);
-		sock_put(sk);
-	} else
+	if (llc_mac_multicast(laddr.mac)) {
+		llc_sap_mcast(sap, &laddr, skb);
 		kfree_skb(skb);
+	} else {
+		struct sock *sk = llc_lookup_dgram(sap, &laddr);
+		if (sk) {
+			skb_set_owner_r(skb, sk);
+			llc_sap_rcv(sap, skb);
+			sock_put(sk);
+		} else
+			kfree_skb(skb);
+	}
 }
-- 
cgit v1.2.3


From 30b6c28d2aca4669f2e609ad5d77ea2a6cf0dd3a Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Fri, 26 May 2006 17:44:45 -0700
Subject: [TG3]: Add 5786 PCI ID

Add PCI ID for BCM5786 which is a variant of 5787.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 2 ++
 include/linux/pci_ids.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 862c226dbbe2..cb0ebf83c843 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -229,6 +229,8 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5755M,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5786,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 819c8e1324d1..cf45e8bb69a8 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1887,6 +1887,7 @@
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
 #define PCI_DEVICE_ID_TIGON3_5787M	0x1693
 #define PCI_DEVICE_ID_TIGON3_5782	0x1696
+#define PCI_DEVICE_ID_TIGON3_5786	0x169a
 #define PCI_DEVICE_ID_TIGON3_5787	0x169b
 #define PCI_DEVICE_ID_TIGON3_5788	0x169c
 #define PCI_DEVICE_ID_TIGON3_5789	0x169d
-- 
cgit v1.2.3


From 546be2405be119ef55467aace45f337a16e5d424 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:03:58 -0700
Subject: [IPSEC] xfrm: Undo afinfo lock proliferation

The number of locks used to manage afinfo structures can easily be reduced
down to one each for policy and state respectively.  This is based on the
observation that the write locks are only held by module insertion/removal
which are very rare events so there is no need to further differentiate
between the insertion of modules like ipv6 versus esp6.

The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look
suspicious at first.  However, after you realise that nobody ever takes
the corresponding write lock you'll feel better :)

As far as I can gather it's an attempt to guard against the removal of
the corresponding modules.  Since neither module can be unloaded at all
we can leave it to whoever fixes up IPv6 unloading :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      |  9 +-------
 net/ipv4/xfrm4_policy.c |  6 -----
 net/ipv4/xfrm4_state.c  |  1 -
 net/ipv6/xfrm6_policy.c |  6 -----
 net/ipv6/xfrm6_state.c  |  1 -
 net/xfrm/xfrm_policy.c  | 58 +++++++++++++++++++++++++++++--------------------
 net/xfrm/xfrm_state.c   |  9 +++-----
 7 files changed, 38 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index afa508d92c93..ed7c9747059d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -204,8 +204,7 @@ struct xfrm_type;
 struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
-	rwlock_t		lock;
-	struct xfrm_type_map	*type_map;
+	struct xfrm_type	*type_map[256];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
@@ -232,7 +231,6 @@ extern int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
 	unsigned short		family;
-	rwlock_t		lock;
 	struct list_head	*state_bydst;
 	struct list_head	*state_byspi;
 	int			(*init_flags)(struct xfrm_state *x);
@@ -264,11 +262,6 @@ struct xfrm_type
 	u32			(*get_max_size)(struct xfrm_state *, int size);
 };
 
-struct xfrm_type_map {
-	rwlock_t		lock;
-	struct xfrm_type	*map[256];
-};
-
 extern int xfrm_register_type(struct xfrm_type *type, unsigned short family);
 extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
 extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8604c747bca5..c0465284dfac 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -17,8 +17,6 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
-static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
-
 static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
 	return __ip_route_output_key((struct rtable**)dst, fl);
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl)
 
 static inline int xfrm4_garbage_collect(void)
 {
-	read_lock(&xfrm4_policy_afinfo.lock);
 	xfrm4_policy_afinfo.garbage_collect();
-	read_unlock(&xfrm4_policy_afinfo.lock);
 	return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
 }
 
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.family = 		AF_INET,
-	.lock = 		RW_LOCK_UNLOCKED,
-	.type_map = 		&xfrm4_type_map,
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
 	.find_bundle = 		__xfrm4_find_bundle,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index dbabf81a9b7b..81e1751c966e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
-	.lock			= RW_LOCK_UNLOCKED,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 88c840f1beb6..ee715f2691e9 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -23,8 +23,6 @@
 static struct dst_ops xfrm6_dst_ops;
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
 
-static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
-
 static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
 	int err = 0;
@@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
 
 static inline int xfrm6_garbage_collect(void)
 {
-	read_lock(&xfrm6_policy_afinfo.lock);
 	xfrm6_policy_afinfo.garbage_collect();
-	read_unlock(&xfrm6_policy_afinfo.lock);
 	return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
 }
 
@@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.family =		AF_INET6,
-	.lock = 		RW_LOCK_UNLOCKED,
-	.type_map = 		&xfrm6_type_map,
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
 	.find_bundle =		__xfrm6_find_bundle,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a5723024d3b3..b33296b3f6de 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
 
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
-	.lock			= RW_LOCK_UNLOCKED,
 	.init_tempsel		= __xfrm6_init_tempsel,
 	.state_lookup		= __xfrm6_state_lookup,
 	.find_acq		= __xfrm6_find_acq,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b469c8b54613..44b64a593c01 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family);
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo);
 
 int xfrm_register_type(struct xfrm_type *type, unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	struct xfrm_type_map *typemap;
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
+	struct xfrm_type **typemap;
 	int err = 0;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 	typemap = afinfo->type_map;
 
-	write_lock_bh(&typemap->lock);
-	if (likely(typemap->map[type->proto] == NULL))
-		typemap->map[type->proto] = type;
+	if (likely(typemap[type->proto] == NULL))
+		typemap[type->proto] = type;
 	else
 		err = -EEXIST;
-	write_unlock_bh(&typemap->lock);
-	xfrm_policy_put_afinfo(afinfo);
+	xfrm_policy_unlock_afinfo(afinfo);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_register_type);
 
 int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
 {
-	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	struct xfrm_type_map *typemap;
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family);
+	struct xfrm_type **typemap;
 	int err = 0;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 	typemap = afinfo->type_map;
 
-	write_lock_bh(&typemap->lock);
-	if (unlikely(typemap->map[type->proto] != type))
+	if (unlikely(typemap[type->proto] != type))
 		err = -ENOENT;
 	else
-		typemap->map[type->proto] = NULL;
-	write_unlock_bh(&typemap->lock);
-	xfrm_policy_put_afinfo(afinfo);
+		typemap[type->proto] = NULL;
+	xfrm_policy_unlock_afinfo(afinfo);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_unregister_type);
@@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type);
 struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo;
-	struct xfrm_type_map *typemap;
+	struct xfrm_type **typemap;
 	struct xfrm_type *type;
 	int modload_attempted = 0;
 
@@ -102,11 +100,9 @@ retry:
 		return NULL;
 	typemap = afinfo->type_map;
 
-	read_lock(&typemap->lock);
-	type = typemap->map[proto];
+	type = typemap[proto];
 	if (unlikely(type && !try_module_get(type->owner)))
 		type = NULL;
-	read_unlock(&typemap->lock);
 	if (!type && !modload_attempted) {
 		xfrm_policy_put_afinfo(afinfo);
 		request_module("xfrm-type-%d-%d",
@@ -1306,17 +1302,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 		return NULL;
 	read_lock(&xfrm_policy_afinfo_lock);
 	afinfo = xfrm_policy_afinfo[family];
-	if (likely(afinfo != NULL))
-		read_lock(&afinfo->lock);
-	read_unlock(&xfrm_policy_afinfo_lock);
+	if (unlikely(!afinfo))
+		read_unlock(&xfrm_policy_afinfo_lock);
 	return afinfo;
 }
 
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
-	if (unlikely(afinfo == NULL))
-		return;
-	read_unlock(&afinfo->lock);
+	read_unlock(&xfrm_policy_afinfo_lock);
+}
+
+static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	if (unlikely(family >= NPROTO))
+		return NULL;
+	write_lock_bh(&xfrm_policy_afinfo_lock);
+	afinfo = xfrm_policy_afinfo[family];
+	if (unlikely(!afinfo))
+		write_unlock_bh(&xfrm_policy_afinfo_lock);
+	return afinfo;
+}
+
+static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+	write_unlock_bh(&xfrm_policy_afinfo_lock);
 }
 
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 93a2f36ad3db..ee62c239a7e3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1103,17 +1103,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
 		return NULL;
 	read_lock(&xfrm_state_afinfo_lock);
 	afinfo = xfrm_state_afinfo[family];
-	if (likely(afinfo != NULL))
-		read_lock(&afinfo->lock);
-	read_unlock(&xfrm_state_afinfo_lock);
+	if (unlikely(!afinfo))
+		read_unlock(&xfrm_state_afinfo_lock);
 	return afinfo;
 }
 
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
 {
-	if (unlikely(afinfo == NULL))
-		return;
-	read_unlock(&afinfo->lock);
+	read_unlock(&xfrm_state_afinfo_lock);
 }
 
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
-- 
cgit v1.2.3


From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:05:54 -0700
Subject: [IPSEC] xfrm: Abstract out encapsulation modes

This patch adds the structure xfrm_mode.  It is meant to represent
the operations carried out by transport/tunnel modes.

By doing this we allow additional encapsulation modes to be added
without clogging up the xfrm_input/xfrm_output paths.

Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and
BEET modes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h            |   4 ++
 include/net/xfrm.h              |  17 ++++++
 net/ipv4/Kconfig                |  18 ++++++
 net/ipv4/Makefile               |   2 +
 net/ipv4/xfrm4_input.c          |  28 +--------
 net/ipv4/xfrm4_mode_transport.c |  69 ++++++++++++++++++++++
 net/ipv4/xfrm4_mode_tunnel.c    | 125 ++++++++++++++++++++++++++++++++++++++++
 net/ipv4/xfrm4_output.c         |  61 +-------------------
 net/ipv6/Kconfig                |  20 +++++++
 net/ipv6/Makefile               |   2 +
 net/ipv6/ip6_output.c           |   2 +
 net/ipv6/xfrm6_input.c          |  29 +---------
 net/ipv6/xfrm6_mode_transport.c |  73 +++++++++++++++++++++++
 net/ipv6/xfrm6_mode_tunnel.c    | 121 ++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_output.c         |  63 +-------------------
 net/xfrm/xfrm_policy.c          |  83 ++++++++++++++++++++++++++
 net/xfrm/xfrm_state.c           |   6 ++
 17 files changed, 553 insertions(+), 170 deletions(-)
 create mode 100644 net/ipv4/xfrm4_mode_transport.c
 create mode 100644 net/ipv4/xfrm4_mode_tunnel.c
 create mode 100644 net/ipv6/xfrm6_mode_transport.c
 create mode 100644 net/ipv6/xfrm6_mode_tunnel.c

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 6b42cc474c01..46a15c7a1a13 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -118,6 +118,10 @@ enum
 	XFRM_SHARE_UNIQUE	/* Use once */
 };
 
+#define XFRM_MODE_TRANSPORT 0
+#define XFRM_MODE_TUNNEL 1
+#define XFRM_MODE_MAX 2
+
 /* Netlink configuration messages.  */
 enum {
 	XFRM_MSG_BASE = 0x10,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed7c9747059d..ed5bb34f817f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,8 @@
 #include <net/ip6_fib.h>
 
 #define XFRM_ALIGN8(len)	(((len) + 7) & ~7)
+#define MODULE_ALIAS_XFRM_MODE(family, encap) \
+	MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
 
 extern struct sock *xfrm_nl;
 extern u32 sysctl_xfrm_aevent_etime;
@@ -164,6 +166,7 @@ struct xfrm_state
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	struct xfrm_type	*type;
+	struct xfrm_mode	*mode;
 
 	/* Security context */
 	struct xfrm_sec_ctx	*security;
@@ -205,6 +208,7 @@ struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
 	struct xfrm_type	*type_map[256];
+	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
@@ -267,6 +271,19 @@ extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
 extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
 extern void xfrm_put_type(struct xfrm_type *type);
 
+struct xfrm_mode {
+	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
+	int (*output)(struct sk_buff *skb);
+
+	struct module *owner;
+	unsigned int encap;
+};
+
+extern int xfrm_register_mode(struct xfrm_mode *mode, int family);
+extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family);
+extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family);
+extern void xfrm_put_mode(struct xfrm_mode *mode);
+
 struct xfrm_tmpl
 {
 /* id in template is interpreted as:
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f75322377..35eb70b1448d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,24 @@ config INET_TUNNEL
 	tristate
 	default n
 
+config INET_XFRM_MODE_TRANSPORT
+	tristate "IP: IPsec transport mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+	tristate "IP: IPsec tunnel mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config INET_DIAG
 	tristate "INET: socket monitoring interface"
 	default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0b9d2c..4cc94482eacc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 3e174c83bfe7..817ed84511a6 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -13,7 +13,6 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb)
 
 EXPORT_SYMBOL(xfrm4_rcv);
 
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct iphdr *outer_iph = skb->nh.iph;
-	struct iphdr *inner_iph = skb->h.ipiph;
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
 {
 	switch (nexthdr) {
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 
 		xfrm_vec[xfrm_nr++] = x;
 
-		iph = skb->nh.iph;
+		if (x->mode->input(x, skb))
+			goto drop;
 
 		if (x->props.mode) {
-			if (iph->protocol != IPPROTO_IPIP)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv4_copy_dscp(iph, skb->h.ipiph);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
-			memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000000..e46d9a4ccc55
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,69 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x;
+	struct iphdr *iph;
+	int ihl;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	ihl = iph->ihl * 4;
+	skb->h.raw += ihl;
+
+	x = skb->dst->xfrm;
+	skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
+	return 0;
+}
+
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+	.input = xfrm4_transport_input,
+	.output = xfrm4_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000000..f8d880beb12f
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct iphdr *outer_iph = skb->nh.iph;
+	struct iphdr *inner_iph = skb->h.ipiph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *      tot_len
+ *      check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static int xfrm4_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct iphdr *iph, *top_iph;
+	int flags;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	skb->nh.raw = skb_push(skb, x->props.header_len);
+	top_iph = skb->nh.iph;
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	/* DS disclosed */
+	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
+	if (!top_iph->frag_off)
+		__ip_select_ident(top_iph, dst->child, 0);
+
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	top_iph->protocol = IPPROTO_IPIP;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	return 0;
+}
+
+static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	int err = -EINVAL;
+
+	if (iph->protocol != IPPROTO_IPIP)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv4_copy_dscp(iph, skb->h.ipiph);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+	.input = xfrm4_tunnel_input,
+	.output = xfrm4_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm4_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+	BUG_ON(err);
+}
+
+module_init(xfrm4_tunnel_init);
+module_exit(xfrm4_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4ef8efaf6a67..ac9d91d4bb05 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,67 +12,10 @@
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/netfilter_ipv4.h>
-#include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/icmp.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header will be moved forward to make space
- * for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	tot_len
- *	check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
- */
-static void xfrm4_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct iphdr *iph, *top_iph;
-	int flags;
-
-	iph = skb->nh.iph;
-	skb->h.ipiph = iph;
-
-	skb->nh.raw = skb_push(skb, x->props.header_len);
-	top_iph = skb->nh.iph;
-
-	if (!x->props.mode) {
-		skb->h.raw += iph->ihl*4;
-		memmove(top_iph, iph, iph->ihl*4);
-		return;
-	}
-
-	top_iph->ihl = 5;
-	top_iph->version = 4;
-
-	/* DS disclosed */
-	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-
-	flags = x->props.flags;
-	if (flags & XFRM_STATE_NOECN)
-		IP_ECN_clear(top_iph);
-
-	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
-		0 : (iph->frag_off & htons(IP_DF));
-	if (!top_iph->frag_off)
-		__ip_select_ident(top_iph, dst->child, 0);
-
-	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
-
-	top_iph->saddr = x->props.saddr.a4;
-	top_iph->daddr = x->id.daddr.a4;
-	top_iph->protocol = IPPROTO_IPIP;
-
-	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
-}
-
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm4_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f8a107ab5592..e923d4dea418 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -106,6 +106,26 @@ config INET6_TUNNEL
 	tristate
 	default n
 
+config INET6_XFRM_MODE_TRANSPORT
+	tristate "IPv6: IPsec transport mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_TUNNEL
+	tristate "IPv6: IPsec tunnel mode"
+	depends on IPV6
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
 config IPV6_TUNNEL
 	tristate "IPv6: IPv6-in-IPv6 tunnel"
 	select INET6_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index a760b0988fbb..386e0a626948 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e46048974f37..416f6e428a0a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
 #include <linux/in6.h>
 #include <linux/tcp.h>
 #include <linux/route.h>
+#include <linux/module.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
@@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 	return offset;
 }
+EXPORT_SYMBOL_GPL(ip6_find_1stfragopt);
 
 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00cfdee18dca..0405d74ff910 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -13,21 +13,9 @@
 #include <linux/string.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
-{
-	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
-	struct ipv6hdr *inner_iph = skb->h.ipv6h;
-
-	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
-		IP6_ECN_set_ce(inner_iph);
-}
-
 int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 {
 	int err;
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 
 		xfrm_vec[xfrm_nr++] = x;
 
+		if (x->mode->input(x, skb))
+			goto drop;
+
 		if (x->props.mode) { /* XXX */
-			if (nexthdr != IPPROTO_IPV6)
-				goto drop;
-			if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-				goto drop;
-			if (skb_cloned(skb) &&
-			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-				goto drop;
-			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
-				ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
-			if (!(x->props.flags & XFRM_STATE_NOECN))
-				ipip6_ecn_decapsulate(skb);
-			skb->mac.raw = memmove(skb->data - skb->mac_len,
-					       skb->mac.raw, skb->mac_len);
-			skb->nh.raw = skb->data;
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 000000000000..5efbbae08ef0
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,73 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_transport_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x = skb->dst->xfrm;
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
+	skb->nh.raw = prevhdr - x->props.header_len;
+	skb->h.raw = skb->data + hdr_len;
+	memmove(skb->data, iph, hdr_len);
+	return 0;
+}
+
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_mode xfrm6_transport_mode = {
+	.input = xfrm6_transport_input,
+	.output = xfrm6_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm6_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
+}
+
+static void __exit xfrm6_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 000000000000..8af79be2edca
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,121 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct ipv6hdr *outer_iph = skb->nh.ipv6h;
+	struct ipv6hdr *inner_iph = skb->h.ipv6h;
+
+	if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
+		IP6_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.  The following fields
+ * in it shall be filled in by x->type->output:
+ *	payload_len
+ *
+ * On exit, skb->h will be set to the start of the encapsulation header to be
+ * filled in by x->type->output and skb->nh will be set to the nextheader field
+ * of the extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.  The value of skb->data will always
+ * point to the top IP header.
+ */
+static int xfrm6_tunnel_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct ipv6hdr *iph, *top_iph;
+	int dsfield;
+
+	skb_push(skb, x->props.header_len);
+	iph = skb->nh.ipv6h;
+
+	skb->nh.raw = skb->data;
+	top_iph = skb->nh.ipv6h;
+	skb->nh.raw = &top_iph->nexthdr;
+	skb->h.ipv6h = top_iph + 1;
+
+	top_iph->version = 6;
+	top_iph->priority = iph->priority;
+	top_iph->flow_lbl[0] = iph->flow_lbl[0];
+	top_iph->flow_lbl[1] = iph->flow_lbl[1];
+	top_iph->flow_lbl[2] = iph->flow_lbl[2];
+	dsfield = ipv6_get_dsfield(top_iph);
+	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		dsfield &= ~INET_ECN_MASK;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+	top_iph->nexthdr = IPPROTO_IPV6; 
+	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
+	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
+	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
+	return 0;
+}
+
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6)
+		goto out;
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h);
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip6_ecn_decapsulate(skb);
+	skb->mac.raw = memmove(skb->data - skb->mac_len,
+			       skb->mac.raw, skb->mac_len);
+	skb->nh.raw = skb->data;
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm6_tunnel_mode = {
+	.input = xfrm6_tunnel_input,
+	.output = xfrm6_tunnel_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+};
+
+static int __init xfrm6_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
+}
+
+static void __exit xfrm6_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 80242172a5df..16e84254a252 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -14,68 +14,9 @@
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
-/* Add encapsulation header.
- *
- * In transport mode, the IP header and mutable extension headers will be moved
- * forward to make space for the encapsulation header.
- *
- * In tunnel mode, the top IP header will be constructed per RFC 2401.
- * The following fields in it shall be filled in by x->type->output:
- *	payload_len
- *
- * On exit, skb->h will be set to the start of the encapsulation header to be
- * filled in by x->type->output and skb->nh will be set to the nextheader field
- * of the extension header directly preceding the encapsulation header, or in
- * its absence, that of the top IP header.  The value of skb->data will always
- * point to the top IP header.
- */
-static void xfrm6_encap(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
-	struct ipv6hdr *iph, *top_iph;
-	int dsfield;
-
-	skb_push(skb, x->props.header_len);
-	iph = skb->nh.ipv6h;
-
-	if (!x->props.mode) {
-		u8 *prevhdr;
-		int hdr_len;
-
-		hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
-		skb->nh.raw = prevhdr - x->props.header_len;
-		skb->h.raw = skb->data + hdr_len;
-		memmove(skb->data, iph, hdr_len);
-		return;
-	}
-
-	skb->nh.raw = skb->data;
-	top_iph = skb->nh.ipv6h;
-	skb->nh.raw = &top_iph->nexthdr;
-	skb->h.ipv6h = top_iph + 1;
-
-	top_iph->version = 6;
-	top_iph->priority = iph->priority;
-	top_iph->flow_lbl[0] = iph->flow_lbl[0];
-	top_iph->flow_lbl[1] = iph->flow_lbl[1];
-	top_iph->flow_lbl[2] = iph->flow_lbl[2];
-	dsfield = ipv6_get_dsfield(top_iph);
-	dsfield = INET_ECN_encapsulate(dsfield, dsfield);
-	if (x->props.flags & XFRM_STATE_NOECN)
-		dsfield &= ~INET_ECN_MASK;
-	ipv6_change_dsfield(top_iph, 0, dsfield);
-	top_iph->nexthdr = IPPROTO_IPV6; 
-	top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT);
-	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
-	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
-}
-
 static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		xfrm6_encap(skb);
+		err = x->mode->output(skb);
+		if (err)
+			goto error;
 
 		err = x->type->output(x, skb);
 		if (err)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 44b64a593c01..b8936926c24b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type)
 	module_put(type->owner);
 }
 
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -EEXIST;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == NULL)) {
+		modemap[mode->encap] = mode;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_policy_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	err = -ENOENT;
+	modemap = afinfo->mode_map;
+	if (likely(modemap[mode->encap] == mode)) {
+		modemap[mode->encap] = NULL;
+		err = 0;
+	}
+
+	xfrm_policy_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+
+struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	struct xfrm_mode *mode;
+	int modload_attempted = 0;
+
+	if (unlikely(encap >= XFRM_MODE_MAX))
+		return NULL;
+
+retry:
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return NULL;
+
+	mode = afinfo->mode_map[encap];
+	if (unlikely(mode && !try_module_get(mode->owner)))
+		mode = NULL;
+	if (!mode && !modload_attempted) {
+		xfrm_policy_put_afinfo(afinfo);
+		request_module("xfrm-mode-%d-%d", family, encap);
+		modload_attempted = 1;
+		goto retry;
+	}
+
+	xfrm_policy_put_afinfo(afinfo);
+	return mode;
+}
+
+void xfrm_put_mode(struct xfrm_mode *mode)
+{
+	module_put(mode->owner);
+}
+
 static inline unsigned long make_jiffies(long secs)
 {
 	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index ee62c239a7e3..17b29ec3c417 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	kfree(x->ealg);
 	kfree(x->calg);
 	kfree(x->encap);
+	if (x->mode)
+		xfrm_put_mode(x->mode);
 	if (x->type) {
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
@@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x)
 	if (err)
 		goto error;
 
+	x->mode = xfrm_get_mode(x->props.mode, family);
+	if (x->mode == NULL)
+		goto error;
+
 	x->km.state = XFRM_STATE_VALID;
 
 error:
-- 
cgit v1.2.3


From 73654d61e556483ad324b90989eae26b22df6ef6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 27 May 2006 23:06:33 -0700
Subject: [IPSEC] xfrm: Use IPPROTO_MAX instead of 256

The size of the type_map array (256) comes from the number of IP protocols,
i.e., IPPROTO_MAX.  This patch is based on a suggestion from Ingo Oeser.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed5bb34f817f..9c5ee9f20b65 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -207,7 +207,7 @@ struct xfrm_type;
 struct xfrm_dst;
 struct xfrm_policy_afinfo {
 	unsigned short		family;
-	struct xfrm_type	*type_map[256];
+	struct xfrm_type	*type_map[IPPROTO_MAX];
 	struct xfrm_mode	*mode_map[XFRM_MODE_MAX];
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
-- 
cgit v1.2.3


From 62b7743483b402f8fb73545d5d487ca714e82766 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:20:32 -0700
Subject: [NETFILTER]: x_tables: add quota match

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_quota.h | 16 +++++++
 net/netfilter/Kconfig              | 10 ++++
 net/netfilter/Makefile             |  1 +
 net/netfilter/xt_quota.c           | 96 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+)
 create mode 100644 include/linux/netfilter/xt_quota.h
 create mode 100644 net/netfilter/xt_quota.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h
new file mode 100644
index 000000000000..acd7fd77bbee
--- /dev/null
+++ b/include/linux/netfilter/xt_quota.h
@@ -0,0 +1,16 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+	XT_QUOTA_INVERT		= 0x1,
+};
+#define XT_QUOTA_MASK		0x1
+
+struct xt_quota_info {
+	u_int32_t		flags;
+	u_int32_t		pad;
+	aligned_u64		quota;
+	struct xt_quota_info	*master;
+};
+
+#endif /* _XT_QUOTA_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e2893effdfaa..5543c7b74535 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -329,6 +329,16 @@ config NETFILTER_XT_MATCH_PKTTYPE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_QUOTA
+	tristate '"quota" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `quota' match, which allows to match on a
+	  byte counter.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config NETFILTER_XT_MATCH_REALM
 	tristate  '"realm" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 95b7e416512d..4b6a6ea07373 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 000000000000..4cdba7469dc4
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,96 @@
+/*
+ * netfilter module to enforce network quotas
+ *
+ * Sam Johnston <samj@samj.net>
+ */
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+
+static DEFINE_SPINLOCK(quota_lock);
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+	struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master;
+	int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0;
+
+	spin_lock_bh(&quota_lock);
+	if (q->quota >= skb->len) {
+		q->quota -= skb->len;
+		ret ^= 1;
+	} else {
+	        /* we do not allow even small packets from now on */
+	        q->quota = 0;
+	}
+	spin_unlock_bh(&quota_lock);
+
+	return ret;
+}
+
+static int
+checkentry(const char *tablename, const void *entry,
+	   const struct xt_match *match, void *matchinfo,
+	   unsigned int matchsize, unsigned int hook_mask)
+{
+	struct xt_quota_info *q = (struct xt_quota_info *)matchinfo;
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return 0;
+	/* For SMP, we only want to use one set of counters. */
+	q->master = q;
+	return 1;
+}
+
+static struct xt_match quota_match = {
+	.name		= "quota",
+	.family		= AF_INET,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_quota_info),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static struct xt_match quota_match6 = {
+	.name		= "quota",
+	.family		= AF_INET6,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_quota_info),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init xt_quota_init(void)
+{
+	int ret;
+
+	ret = xt_register_match(&quota_match);
+	if (ret)
+		goto err1;
+	ret = xt_register_match(&quota_match6);
+	if (ret)
+		goto err2;
+	return ret;
+
+err2:
+	xt_unregister_match(&quota_match);
+err1:
+	return ret;
+}
+
+static void __exit xt_quota_fini(void)
+{
+	xt_unregister_match(&quota_match6);
+	xt_unregister_match(&quota_match);
+}
+
+module_init(xt_quota_init);
+module_exit(xt_quota_fini);
-- 
cgit v1.2.3


From f3389805e53a13bd969ee1c8fc5a4137b7c6c167 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:21:00 -0700
Subject: [NETFILTER]: x_tables: add statistic match

Add statistic match which is a combination of the nth and random matches.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_statistic.h |  32 ++++++++++
 net/netfilter/Kconfig                  |   6 ++
 net/netfilter/Makefile                 |   1 +
 net/netfilter/xt_statistic.c           | 112 +++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+)
 create mode 100644 include/linux/netfilter/xt_statistic.h
 create mode 100644 net/netfilter/xt_statistic.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h
new file mode 100644
index 000000000000..c344e9916e23
--- /dev/null
+++ b/include/linux/netfilter/xt_statistic.h
@@ -0,0 +1,32 @@
+#ifndef _XT_STATISTIC_H
+#define _XT_STATISTIC_H
+
+enum xt_statistic_mode {
+	XT_STATISTIC_MODE_RANDOM,
+	XT_STATISTIC_MODE_NTH,
+	__XT_STATISTIC_MODE_MAX
+};
+#define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1)
+
+enum xt_statistic_flags {
+	XT_STATISTIC_INVERT		= 0x1,
+};
+#define XT_STATISTIC_MASK		0x1
+
+struct xt_statistic_info {
+	u_int16_t			mode;
+	u_int16_t			flags;
+	union {
+		struct {
+			u_int32_t	probability;
+		} random;
+		struct {
+			u_int32_t	every;
+			u_int32_t	packet;
+			u_int32_t	count;
+		} nth;
+	} u;
+	struct xt_statistic_info	*master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_STATISTIC_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 5543c7b74535..85a7e1770252 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -375,6 +375,12 @@ config NETFILTER_XT_MATCH_STATE
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_STATISTIC
+	tristate '"statistic" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  statistic module
+
 config NETFILTER_XT_MATCH_STRING
 	tristate  '"string" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4b6a6ea07373..df1da25f40df 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 000000000000..de1037f58596
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
+ */
+
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+
+#include <linux/netfilter/xt_statistic.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("xtables statistical match module");
+MODULE_ALIAS("ipt_statistic");
+MODULE_ALIAS("ip6t_statistic");
+
+static DEFINE_SPINLOCK(nth_lock);
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in, const struct net_device *out,
+      const struct xt_match *match, const void *matchinfo,
+      int offset, unsigned int protoff, int *hotdrop)
+{
+	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
+	int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0;
+
+	switch (info->mode) {
+	case XT_STATISTIC_MODE_RANDOM:
+		if ((net_random() & 0x7FFFFFFF) < info->u.random.probability)
+			ret ^= 1;
+		break;
+	case XT_STATISTIC_MODE_NTH:
+		info = info->master;
+		spin_lock_bh(&nth_lock);
+		if (info->u.nth.count++ == info->u.nth.every) {
+			info->u.nth.count = 0;
+			ret ^= 1;
+		}
+		spin_unlock_bh(&nth_lock);
+		break;
+	}
+
+	return ret;
+}
+
+static int
+checkentry(const char *tablename, const void *entry,
+	   const struct xt_match *match, void *matchinfo,
+	   unsigned int matchsize, unsigned int hook_mask)
+{
+	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
+
+	if (info->mode > XT_STATISTIC_MODE_MAX ||
+	    info->flags & ~XT_STATISTIC_MASK)
+		return 0;
+	info->master = info;
+	return 1;
+}
+
+static struct xt_match statistic_match = {
+	.name		= "statistic",
+	.match		= match,
+	.matchsize	= sizeof(struct xt_statistic_info),
+	.checkentry	= checkentry,
+	.family		= AF_INET,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match statistic_match6 = {
+	.name		= "statistic",
+	.match		= match,
+	.matchsize	= sizeof(struct xt_statistic_info),
+	.checkentry	= checkentry,
+	.family		= AF_INET6,
+	.me		= THIS_MODULE,
+};
+
+static int __init xt_statistic_init(void)
+{
+	int ret;
+
+	ret = xt_register_match(&statistic_match);
+	if (ret)
+		goto err1;
+
+	ret = xt_register_match(&statistic_match6);
+	if (ret)
+		goto err2;
+	return ret;
+err2:
+	xt_unregister_match(&statistic_match);
+err1:
+	return ret;
+}
+
+static void __exit xt_statistic_fini(void)
+{
+	xt_unregister_match(&statistic_match6);
+	xt_unregister_match(&statistic_match);
+}
+
+module_init(xt_statistic_init);
+module_exit(xt_statistic_fini);
-- 
cgit v1.2.3


From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:23:54 -0700
Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h    |  1 +
 include/linux/sysctl.h                         |  2 ++
 include/net/netfilter/nf_conntrack.h           |  1 +
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c   |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_udp.c    |  2 +-
 net/ipv4/netfilter/ip_conntrack_standalone.c   | 11 +++++++++++
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_proto_tcp.c         |  5 +++--
 net/netfilter/nf_conntrack_proto_udp.c         |  3 ++-
 net/netfilter/nf_conntrack_standalone.c        | 11 +++++++++++
 12 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index d54d7b278e96..5473c01f69ea 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -293,6 +293,7 @@ static inline int is_dying(struct ip_conntrack *ct)
 }
 
 extern unsigned int ip_conntrack_htable_size;
+extern int ip_conntrack_checksum;
  
 #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cd9e7c0825ad..98338ed2c0b6 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -313,6 +313,7 @@ enum
 	NET_NF_CONNTRACK_FRAG6_TIMEOUT=29,
 	NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30,
 	NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31,
+	NET_NF_CONNTRACK_CHECKSUM=32,
 };
 
 /* /proc/sys/net/ipv4 */
@@ -492,6 +493,7 @@ enum
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25,
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26,
 	NET_IPV4_NF_CONNTRACK_COUNT=27,
+	NET_IPV4_NF_CONNTRACK_CHECKSUM=28,
 };
  
 /* /proc/sys/net/ipv6 */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 916013ca4a5c..dbe7a114d0c5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -285,6 +285,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
 }
 
 extern unsigned int nf_conntrack_htable_size;
+extern int nf_conntrack_checksum;
 
 #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++)
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index d8b14a9010a6..23f1c504586d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 062b252b58ad..c5c2ce5cdeb8 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 70899868783b..9b2c16b4d2ff 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	 * because the semantic of CHECKSUM_HW is different there 
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
 		if (LOG_INVALID(IPPROTO_UDP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index f0cc7feb0da3..6cb9b989d14c 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int ip_conntrack_checksum = 1;
+
 static struct ctl_table_header *ip_ct_sysctl_header;
 
 static ctl_table ip_ct_sysctl_table[] = {
@@ -591,6 +593,14 @@ static ctl_table ip_ct_sysctl_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_CHECKSUM,
+		.procname	= "ip_conntrack_checksum",
+		.data		= &ip_conntrack_checksum,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
@@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
 EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
 EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+EXPORT_SYMBOL_GPL(ip_conntrack_checksum);
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 4b0d361cc6e6..663a73ee3f2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* See ip_conntrack_proto_tcp.c */
-	if (hooknum == NF_IP_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
 		if (LOG_INVALID(IPPROTO_ICMP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 86c6703265d0..ef18a7b7014b 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
 		return -NF_ACCEPT;
 	}
 
-	if (hooknum == NF_IP6_PRE_ROUTING &&
+	if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING &&
 	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
 		nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
 			      "nf_ct_icmpv6: ICMPv6 checksum failed\n");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 69899f27d26a..12fb7c0a1509 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb,
 	 * and moreover root might send raw packets.
 	 */
 	/* FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
-	     (pf == PF_INET6 && hooknum  == NF_IP6_PRE_ROUTING)) &&
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index d93edbfde9e3..ae07ebe3ab37 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
 	 * because the semantic of CHECKSUM_HW is different there
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
-	if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+	if (nf_conntrack_checksum &&
+	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
 	     (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) &&
 	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
 		if (LOG_INVALID(IPPROTO_UDP))
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 408960c6a544..e01d20d8e287 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout;
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
+int nf_conntrack_checksum = 1;
+
 static struct ctl_table_header *nf_ct_sysctl_header;
 
 static ctl_table nf_ct_sysctl_table[] = {
@@ -482,6 +484,14 @@ static ctl_table nf_ct_sysctl_table[] = {
 		.mode           = 0444,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_NF_CONNTRACK_CHECKSUM,
+		.procname	= "nf_conntrack_checksum",
+		.data		= &nf_conntrack_checksum,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
 		.procname	= "nf_conntrack_tcp_timeout_syn_sent",
@@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put);
 EXPORT_SYMBOL(nf_ct_l3proto_find_get);
 EXPORT_SYMBOL(nf_ct_l3proto_put);
 EXPORT_SYMBOL(nf_ct_l3protos);
+EXPORT_SYMBOL_GPL(nf_conntrack_checksum);
 EXPORT_SYMBOL(nf_conntrack_expect_alloc);
 EXPORT_SYMBOL(nf_conntrack_expect_put);
 EXPORT_SYMBOL(nf_conntrack_expect_related);
-- 
cgit v1.2.3


From 997ae831ade74bdaed4172b1c02060b9efd6e206 Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@inl.fr>
Date: Mon, 29 May 2006 18:24:20 -0700
Subject: [NETFILTER]: conntrack: add fixed timeout flag in connection tracking

Add a flag in a connection status to have a non updated timeout.
This permits to have connection that automatically die at a given
time.

Signed-off-by: Eric Leblond <eric@inl.fr>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nf_conntrack_common.h | 4 ++++
 net/ipv4/netfilter/ip_conntrack_core.c        | 6 ++++++
 net/netfilter/nf_conntrack_core.c             | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 3ff88c878308..d2e4bd7a7a14 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -69,6 +69,10 @@ enum ip_conntrack_status {
 	/* Connection is dying (removed from lists), can not be unset. */
 	IPS_DYING_BIT = 9,
 	IPS_DYING = (1 << IPS_DYING_BIT),
+
+	/* Connection has fixed timeout. */
+	IPS_FIXED_TIMEOUT_BIT = 10,
+	IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT),
 };
 
 /* Connection tracking event bits */
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a297da7bbef5..4fe9e69378df 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1130,6 +1130,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 
 	write_lock_bh(&ip_conntrack_lock);
 
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+		write_unlock_bh(&ip_conntrack_lock);
+		return;
+	}
+
 	/* If not in hash table, timer will not be active yet */
 	if (!is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f9b83f91371a..bc2bd4c3859e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1396,6 +1396,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 
 	write_lock_bh(&nf_conntrack_lock);
 
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+		write_unlock_bh(&nf_conntrack_lock);
+		return;
+	}
+
 	/* If not in hash table, timer will not be active yet */
 	if (!nf_ct_is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
-- 
cgit v1.2.3


From 3726add76643c715d437aceda320d319153b6113 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:24:39 -0700
Subject: [NETFILTER]: ctnetlink: fix NAT configuration

The current configuration only allows to configure one manip and overloads
conntrack status flags with netlink semantic.

Signed-off-by: Patrick Mchardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h |  4 +-
 net/ipv4/netfilter/ip_conntrack_netlink.c     | 53 +++++++++++----------------
 net/netfilter/nf_conntrack_netlink.c          | 53 +++++++++++----------------
 3 files changed, 47 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 668ec946c8e2..b5883ccee295 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -27,13 +27,15 @@ enum ctattr_type {
 	CTA_STATUS,
 	CTA_PROTOINFO,
 	CTA_HELP,
-	CTA_NAT,
+	CTA_NAT_SRC,
+#define CTA_NAT	CTA_NAT_SRC	/* backwards compatibility */
 	CTA_TIMEOUT,
 	CTA_MARK,
 	CTA_COUNTERS_ORIG,
 	CTA_COUNTERS_REPLY,
 	CTA_USE,
 	CTA_ID,
+	CTA_NAT_DST,
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 01bd7cab9367..af152e3623dc 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -629,7 +629,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
 		    const struct ip_conntrack *ct, struct ip_nat_range *range)
 {
 	struct nfattr *tb[CTA_NAT_MAX];
@@ -639,7 +639,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 	
-	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+	nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
 
 	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
 		return -EINVAL;
@@ -854,39 +854,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EINVAL;
 
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
 		return -EINVAL;
 #else
-		unsigned int hooknum;
 		struct ip_nat_range range;
 
-		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
-			return -EINVAL;
-
-		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
-		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
-		       htons(range.min.all), htons(range.max.all));
-		
-		/* This is tricky but it works. ip_nat_setup_info needs the
-		 * hook number as parameter, so let's do the correct 
-		 * conversion and run away */
-		if (status & IPS_SRC_NAT_DONE)
-			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
-		else if (status & IPS_DST_NAT_DONE)
-			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
-		else 
-			return -EINVAL; /* Missing NAT flags */
-
-		DEBUGP("NAT status: %lu\n", 
-		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-		
-		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			return -EEXIST;
-		ip_nat_setup_info(ct, &range, hooknum);
-
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		if (cda[CTA_NAT_DST-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_PRE_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+		}
+		if (cda[CTA_NAT_SRC-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_POST_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+		}
 #endif
 	}
 
@@ -1106,7 +1097,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	/* implicit 'else' */
 
 	/* we only allow nat config for new conntracks */
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index bd10eb944b65..8f27fe9446f2 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -641,7 +641,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = {
 };
 
 static inline int
-ctnetlink_parse_nat(struct nfattr *cda[],
+ctnetlink_parse_nat(struct nfattr *nat,
 		    const struct nf_conn *ct, struct ip_nat_range *range)
 {
 	struct nfattr *tb[CTA_NAT_MAX];
@@ -651,7 +651,7 @@ ctnetlink_parse_nat(struct nfattr *cda[],
 
 	memset(range, 0, sizeof(*range));
 	
-	nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+	nfattr_parse_nested(tb, CTA_NAT_MAX, nat);
 
 	if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
 		return -EINVAL;
@@ -866,39 +866,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[])
 		/* ASSURED bit can only be set */
 		return -EINVAL;
 
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 #ifndef CONFIG_IP_NF_NAT_NEEDED
 		return -EINVAL;
 #else
-		unsigned int hooknum;
 		struct ip_nat_range range;
 
-		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
-			return -EINVAL;
-
-		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
-		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
-		       htons(range.min.all), htons(range.max.all));
-		
-		/* This is tricky but it works. ip_nat_setup_info needs the
-		 * hook number as parameter, so let's do the correct 
-		 * conversion and run away */
-		if (status & IPS_SRC_NAT_DONE)
-			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
-		else if (status & IPS_DST_NAT_DONE)
-			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
-		else 
-			return -EINVAL; /* Missing NAT flags */
-
-		DEBUGP("NAT status: %lu\n", 
-		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
-		
-		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
-			return -EEXIST;
-		ip_nat_setup_info(ct, &range, hooknum);
-
-                DEBUGP("NAT status after setup_info: %lu\n",
-                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		if (cda[CTA_NAT_DST-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_PRE_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, hooknum);
+		}
+		if (cda[CTA_NAT_SRC-1]) {
+			if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct,
+						&range) < 0)
+				return -EINVAL;
+			if (ip_nat_initialized(ct,
+					       HOOK2MANIP(NF_IP_POST_ROUTING)))
+				return -EEXIST;
+			ip_nat_setup_info(ct, &range, hooknum);
+		}
 #endif
 	}
 
@@ -1122,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	/* implicit 'else' */
 
 	/* we only allow nat config for new conntracks */
-	if (cda[CTA_NAT-1]) {
+	if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From c0d4cfd96dd0cc0dbf49435898808b5553af4822 Mon Sep 17 00:00:00 2001
From: Jing Min Zhao <zhaojingmin@users.sourceforge.net>
Date: Mon, 29 May 2006 18:26:27 -0700
Subject: [NETFILTER]: H.323 helper: Add support for Call Forwarding

Signed-off-by: Jing Min Zhao <zhaojingmin@users.sourceforge.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h        |   1 +
 include/linux/netfilter_ipv4/ip_conntrack_h323.h   |   7 ++
 .../ip_conntrack_helper_h323_types.h               |   3 +-
 net/ipv4/netfilter/Kconfig                         |   8 +-
 net/ipv4/netfilter/ip_conntrack_helper_h323.c      | 112 +++++++++++++++++++++
 .../netfilter/ip_conntrack_helper_h323_types.c     |   6 +-
 net/ipv4/netfilter/ip_nat_helper_h323.c            |  77 ++++++++++++++
 7 files changed, 206 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 5473c01f69ea..17d7ef938a09 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -154,6 +154,7 @@ struct ip_conntrack_expect
 	unsigned int flags;
 
 #ifdef CONFIG_IP_NF_NAT_NEEDED
+	u_int32_t saved_ip;
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
 	union ip_conntrack_manip_proto saved_proto;
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_h323.h b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
index eace86bd2adb..3cbff7379002 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_h323.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
@@ -71,6 +71,13 @@ extern int (*nat_h245_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
 			     unsigned char **data, int dataoff,
 			     TransportAddress * addr, u_int16_t port,
 			     struct ip_conntrack_expect * exp);
+extern int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+				       struct ip_conntrack * ct,
+				       enum ip_conntrack_info ctinfo,
+				       unsigned char **data, int dataoff,
+				       TransportAddress * addr,
+				       u_int16_t port,
+				       struct ip_conntrack_expect * exp);
 extern int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
 			     enum ip_conntrack_info ctinfo,
 			     unsigned char **data, TransportAddress * addr,
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
index cc98f7aa5abe..3d4a773799fc 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
  *
  * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
@@ -412,6 +412,7 @@ typedef struct Facility_UUIE {	/* SEQUENCE */
 		eFacility_UUIE_destinationInfo = (1 << 14),
 		eFacility_UUIE_h245SecurityMode = (1 << 13),
 	} options;
+	TransportAddress alternativeAddress;
 	FacilityReason reason;
 	TransportAddress h245Address;
 	Facility_UUIE_fastStart fastStart;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1540e227890f..f86aeda3a5a0 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -183,10 +183,10 @@ config IP_NF_H323
 	  With this module you can support H.323 on a connection tracking/NAT
 	  firewall.
 
-	  This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP
-	  and T.120 based data and applications including audio, video, FAX,
-	  chat, whiteboard, file transfer, etc. For more information, please
-	  see http://nath323.sourceforge.net/.
+	  This module supports RAS, Fast Start, H.245 Tunnelling, Call
+	  Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+	  whiteboard, file transfer, etc. For more information, please
+	  visit http://nath323.sourceforge.net/.
 
 	  If you want to compile it as a module, say 'M' here and read
 	  Documentation/modules.txt.  If unsure, say 'N'.
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
index 518f581d39ec..3052468a6ab1 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -22,6 +22,8 @@
 #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
 #include <linux/netfilter_ipv4/ip_conntrack_h323.h>
 #include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 
 #if 0
 #define DEBUGP printk
@@ -38,6 +40,13 @@ static int gkrouted_only = 1;
 module_param(gkrouted_only, int, 0600);
 MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
 
+static char *internal_net = NULL;
+static u_int32_t internal_net_addr = 0;
+static u_int32_t internal_net_mask = 0;
+module_param(internal_net, charp, 0600);
+MODULE_PARM_DESC(internal_net, "specify your internal network using format "
+		 "address/mask. this is used by call forwarding support");
+
 /* Hooks for NAT */
 int (*set_h245_addr_hook) (struct sk_buff ** pskb,
 			   unsigned char **data, int dataoff,
@@ -77,6 +86,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb,
 		      unsigned char **data, int dataoff,
 		      TransportAddress * addr, u_int16_t port,
 		      struct ip_conntrack_expect * exp);
+int (*nat_callforwarding_hook) (struct sk_buff ** pskb,
+				struct ip_conntrack * ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				TransportAddress * addr, u_int16_t port,
+				struct ip_conntrack_expect * exp);
 int (*nat_q931_hook) (struct sk_buff ** pskb,
 		      struct ip_conntrack * ct,
 		      enum ip_conntrack_info ctinfo,
@@ -683,6 +698,76 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
 	return ret;
 }
 
+/* Forwarding declaration */
+void ip_conntrack_q931_expect(struct ip_conntrack *new,
+			      struct ip_conntrack_expect *this);
+
+/****************************************************************************/
+static int expect_callforwarding(struct sk_buff **pskb,
+				 struct ip_conntrack *ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data, int dataoff,
+				 TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	/* Read alternativeAddress */
+	if (!get_h225_addr(*data, addr, &ip, &port) || port == 0)
+		return 0;
+
+	/* If the calling party is on the same side of the forward-to party,
+	 * we don't need to track the second call */
+	if (internal_net &&
+	    ((ip & internal_net_mask) == internal_net_addr) ==
+	    ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) ==
+	     internal_net_addr)) {
+		DEBUGP("ip_ct_q931: Call Forwarding not tracked\n");
+		return 0;
+	}
+
+	/* Create expect for the second call leg */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = 0;
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) {
+		/* Need NAT */
+		ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff,
+					      addr, port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = ip_conntrack_q931_expect;
+
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_q931: expect Call Forwarding "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
 /****************************************************************************/
 static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
 			 enum ip_conntrack_info ctinfo,
@@ -878,6 +963,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
 
 	DEBUGP("ip_ct_q931: Facility\n");
 
+	if (facility->reason.choice == eFacilityReason_callForwarded) {
+		if (facility->options & eFacility_UUIE_alternativeAddress)
+			return expect_callforwarding(pskb, ct, ctinfo, data,
+						     dataoff,
+						     &facility->
+						     alternativeAddress);
+		return 0;
+	}
+
 	if (facility->options & eFacility_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
 				  &facility->h245Address);
@@ -1668,6 +1762,7 @@ static void fini(void)
 static int __init init(void)
 {
 	int ret;
+	char *p;
 
 	h323_buffer = kmalloc(65536, GFP_KERNEL);
 	if (!h323_buffer)
@@ -1678,6 +1773,22 @@ static int __init init(void)
 		return ret;
 	}
 
+	if (internal_net) {
+		if ((p = strchr(internal_net, '/')))
+			*p++ = 0;
+		if (isdigit(internal_net[0])) {
+			internal_net_addr = in_aton(internal_net);
+			if (p && isdigit(p[0]))
+				internal_net_mask = in_aton(p);
+			else
+				internal_net_mask = 0xffffffff;
+			internal_net_addr &= internal_net_mask;
+		}
+		DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n",
+		       NIPQUAD(internal_net_addr),
+		       NIPQUAD(internal_net_mask));
+	}
+
 	DEBUGP("ip_ct_h323: init success\n");
 	return 0;
 }
@@ -1696,6 +1807,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook);
 EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
 EXPORT_SYMBOL_GPL(nat_t120_hook);
 EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
 EXPORT_SYMBOL_GPL(nat_q931_hook);
 
 MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
index 022c47b9f6c9..4b359618bedd 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -1,4 +1,4 @@
-/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006
  *
  * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
  *
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF */
 
 static field_t _Facility_UUIE[] = {	/* SEQUENCE */
 	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
-	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0,
-	 _TransportAddress},
+	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
 	{FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
 	 _Facility_UUIE_alternativeAliasAddress},
 	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
index d45663d137a7..419b878fb467 100644
--- a/net/ipv4/netfilter/ip_nat_helper_h323.c
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -486,6 +486,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
 	return 0;
 }
 
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct ip_conntrack *new,
+					 struct ip_conntrack_expect *this)
+{
+	struct ip_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
+
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = this->saved_proto;
+	range.min_ip = range.max_ip = this->saved_ip;
+
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+
+	ip_conntrack_q931_expect(new, this);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct,
+			      enum ip_conntrack_info ctinfo,
+			      unsigned char **data, int dataoff,
+			      TransportAddress * addr, u_int16_t port,
+			      struct ip_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	exp->saved_ip = exp->tuple.dst.ip;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_callforwarding_expect;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	for (nated_port = port; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (!set_h225_addr(pskb, data, dataoff, addr,
+			   ct->tuplehash[!dir].tuple.dst.ip,
+			   nated_port) == 0) {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Success */
+	DEBUGP("ip_nat_q931: expect Call Forwarding "
+	       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
 /****************************************************************************/
 static int __init init(void)
 {
@@ -496,6 +570,7 @@ static int __init init(void)
 	BUG_ON(nat_rtp_rtcp_hook != NULL);
 	BUG_ON(nat_t120_hook != NULL);
 	BUG_ON(nat_h245_hook != NULL);
+	BUG_ON(nat_callforwarding_hook != NULL);
 	BUG_ON(nat_q931_hook != NULL);
 
 	set_h245_addr_hook = set_h245_addr;
@@ -505,6 +580,7 @@ static int __init init(void)
 	nat_rtp_rtcp_hook = nat_rtp_rtcp;
 	nat_t120_hook = nat_t120;
 	nat_h245_hook = nat_h245;
+	nat_callforwarding_hook = nat_callforwarding;
 	nat_q931_hook = nat_q931;
 
 	DEBUGP("ip_nat_h323: init success\n");
@@ -521,6 +597,7 @@ static void __exit fini(void)
 	nat_rtp_rtcp_hook = NULL;
 	nat_t120_hook = NULL;
 	nat_h245_hook = NULL;
+	nat_callforwarding_hook = NULL;
 	nat_q931_hook = NULL;
 	synchronize_net();
 }
-- 
cgit v1.2.3


From ae5b7d8ba2c28d7d9835856fe0ca5f6ec95ea768 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 29 May 2006 18:27:09 -0700
Subject: [NETFILTER]: Add SIP connection tracking helper

Add SIP connection tracking helper. Originally written by
Christian Hentschel <chentschel@arnet.com.ar>, some cleanup, minor
fixes and bidirectional SIP support added by myself.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_sip.h |  44 +++
 net/ipv4/netfilter/Kconfig                      |  18 +
 net/ipv4/netfilter/Makefile                     |   2 +
 net/ipv4/netfilter/ip_conntrack_sip.c           | 471 ++++++++++++++++++++++++
 net/ipv4/netfilter/ip_nat_sip.c                 | 249 +++++++++++++
 5 files changed, 784 insertions(+)
 create mode 100644 include/linux/netfilter_ipv4/ip_conntrack_sip.h
 create mode 100644 net/ipv4/netfilter/ip_conntrack_sip.c
 create mode 100644 net/ipv4/netfilter/ip_nat_sip.c

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sip.h b/include/linux/netfilter_ipv4/ip_conntrack_sip.h
new file mode 100644
index 000000000000..913dad66c0fb
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ip_conntrack_sip.h
@@ -0,0 +1,44 @@
+#ifndef __IP_CONNTRACK_SIP_H__
+#define __IP_CONNTRACK_SIP_H__
+#ifdef __KERNEL__
+
+#define SIP_PORT	5060
+#define SIP_TIMEOUT	3600
+
+#define POS_VIA		0
+#define POS_CONTACT	1
+#define POS_CONTENT	2
+#define POS_MEDIA	3
+#define POS_OWNER	4
+#define POS_CONNECTION	5
+#define POS_REQ_HEADER	6
+#define POS_SDP_HEADER	7
+
+struct sip_header_nfo {
+	const char	*lname;
+	const char	*sname;
+	const char	*ln_str;
+	size_t		lnlen;
+	size_t		snlen;
+	size_t		ln_strlen;
+	int		(*match_len)(const char *, const char *, int *);
+};
+
+extern unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+				       enum ip_conntrack_info ctinfo,
+				       struct ip_conntrack *ct,
+				       const char **dptr);
+extern unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+				       enum ip_conntrack_info ctinfo,
+				       struct ip_conntrack_expect *exp,
+				       const char *dptr);
+
+extern int ct_sip_get_info(const char *dptr, size_t dlen,
+			   unsigned int *matchoff,
+			   unsigned int *matchlen,
+			   struct sip_header_nfo *hnfo);
+extern int ct_sip_lnlen(const char *line, const char *limit);
+extern const char *ct_sip_search(const char *needle, const char *haystack,
+                                 size_t needle_len, size_t haystack_len);
+#endif /* __KERNEL__ */
+#endif /* __IP_CONNTRACK_SIP_H__ */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index f86aeda3a5a0..ff4b118f14a9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -191,6 +191,18 @@ config IP_NF_H323
 	  If you want to compile it as a module, say 'M' here and read
 	  Documentation/modules.txt.  If unsure, say 'N'.
 
+config IP_NF_SIP
+	tristate "SIP protocol support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK && EXPERIMENTAL
+	help
+	  SIP is an application-layer control protocol that can establish,
+	  modify, and terminate multimedia sessions (conferences) such as
+	  Internet telephony calls. With the ip_conntrack_sip and
+	  the ip_nat_sip modules you can support the protocol on a connection
+	  tracking/NATing firewall.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
 config IP_NF_QUEUE
 	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
@@ -503,6 +515,12 @@ config IP_NF_NAT_H323
 	default IP_NF_NAT if IP_NF_H323=y
 	default m if IP_NF_H323=m
 
+config IP_NF_NAT_SIP
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_SIP=y
+	default m if IP_NF_SIP=m
+
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 461cb1eb5de7..3ded4a3af59c 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
 obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
 obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o
 obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
 
 # NAT helpers 
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
 obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
 obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c
new file mode 100644
index 000000000000..fc87ce0da40d
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_sip.c
@@ -0,0 +1,471 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+
+#define MAX_PORTS	8
+static unsigned short ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of sip servers");
+
+static unsigned int sip_timeout = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack *ct,
+				const char **dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sip_hook);
+
+unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				struct ip_conntrack_expect *exp,
+				const char *dptr);
+EXPORT_SYMBOL_GPL(ip_nat_sdp_hook);
+
+int ct_sip_get_info(const char *dptr, size_t dlen,
+				unsigned int *matchoff,
+				unsigned int *matchlen,
+				struct sip_header_nfo *hnfo);
+EXPORT_SYMBOL_GPL(ct_sip_get_info);
+
+
+static int digits_len(const char *dptr, const char *limit, int *shift);
+static int epaddr_len(const char *dptr, const char *limit, int *shift);
+static int skp_digits_len(const char *dptr, const char *limit, int *shift);
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift);
+
+struct sip_header_nfo ct_sip_hdrs[] = {
+	{ 	/* Via header */
+		.lname		= "Via:",
+		.lnlen		= sizeof("Via:") - 1,
+		.sname		= "\r\nv:",
+		.snlen		= sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */
+		.ln_str		= "UDP ",
+		.ln_strlen	= sizeof("UDP ") - 1,
+		.match_len	= epaddr_len,
+	},
+	{ 	/* Contact header */
+		.lname		= "Contact:",
+		.lnlen		= sizeof("Contact:") - 1,
+		.sname		= "\r\nm:",
+		.snlen		= sizeof("\r\nm:") - 1,
+		.ln_str		= "sip:",
+		.ln_strlen	= sizeof("sip:") - 1,
+		.match_len	= skp_epaddr_len
+	},
+	{ 	/* Content length header */
+		.lname		= "Content-Length:",
+		.lnlen		= sizeof("Content-Length:") - 1,
+		.sname		= "\r\nl:",
+		.snlen		= sizeof("\r\nl:") - 1,
+		.ln_str		= ":",
+		.ln_strlen	= sizeof(":") - 1,
+		.match_len	= skp_digits_len
+	},
+	{	/* SDP media info */
+		.lname		= "\nm=",
+		.lnlen		= sizeof("\nm=") - 1,
+		.sname		= "\rm=",
+		.snlen		= sizeof("\rm=") - 1,
+		.ln_str		= "audio ",
+		.ln_strlen	= sizeof("audio ") - 1,
+		.match_len	= digits_len
+	},
+	{ 	/* SDP owner address*/
+		.lname		= "\no=",
+		.lnlen		= sizeof("\no=") - 1,
+		.sname		= "\ro=",
+		.snlen		= sizeof("\ro=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP connection info */
+		.lname		= "\nc=",
+		.lnlen		= sizeof("\nc=") - 1,
+		.sname		= "\rc=",
+		.snlen		= sizeof("\rc=") - 1,
+		.ln_str		= "IN IP4 ",
+		.ln_strlen	= sizeof("IN IP4 ") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* Requests headers */
+		.lname		= "sip:",
+		.lnlen		= sizeof("sip:") - 1,
+		.sname		= "sip:",
+		.snlen		= sizeof("sip:") - 1, /* yes, i know.. ;) */
+		.ln_str		= "@",
+		.ln_strlen	= sizeof("@") - 1,
+		.match_len	= epaddr_len
+	},
+	{ 	/* SDP version header */
+		.lname		= "\nv=",
+		.lnlen		= sizeof("\nv=") - 1,
+		.sname		= "\rv=",
+		.snlen		= sizeof("\rv=") - 1,
+		.ln_str		= "=",
+		.ln_strlen	= sizeof("=") - 1,
+		.match_len	= digits_len
+	}
+};
+EXPORT_SYMBOL_GPL(ct_sip_hdrs);
+
+/* get line lenght until first CR or LF seen. */
+int ct_sip_lnlen(const char *line, const char *limit)
+{
+	const char *k = line;
+
+	while ((line <= limit) && (*line == '\r' || *line == '\n'))
+		line++;
+
+	while (line <= limit) {
+		if (*line == '\r' || *line == '\n')
+			break;
+		line++;
+	}
+	return line - k;
+}
+EXPORT_SYMBOL_GPL(ct_sip_lnlen);
+
+/* Linear string search, case sensitive. */
+const char *ct_sip_search(const char *needle, const char *haystack,
+                          size_t needle_len, size_t haystack_len)
+{
+	const char *limit = haystack + (haystack_len - needle_len);
+
+	while (haystack <= limit) {
+		if (memcmp(haystack, needle, needle_len) == 0)
+			return haystack;
+		haystack++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ct_sip_search);
+
+static int digits_len(const char *dptr, const char *limit, int *shift)
+{
+	int len = 0;
+	while (dptr <= limit && isdigit(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+/* get digits lenght, skiping blank spaces. */
+static int skp_digits_len(const char *dptr, const char *limit, int *shift)
+{
+	for (; dptr <= limit && *dptr == ' '; dptr++)
+		(*shift)++;
+
+	return digits_len(dptr, limit, shift);
+}
+
+/* Simple ipaddr parser.. */
+static int parse_ipaddr(const char *cp,	const char **endp,
+			u_int32_t *ipaddr, const char *limit)
+{
+	unsigned long int val;
+	int i, digit = 0;
+
+	for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) {
+		digit = 0;
+		if (!isdigit(*cp))
+			break;
+
+		val = simple_strtoul(cp, (char **)&cp, 10);
+		if (val > 0xFF)
+			return -1;
+
+		((u_int8_t *)ipaddr)[i] = val;
+		digit = 1;
+
+		if (*cp != '.')
+			break;
+		cp++;
+	}
+	if (!digit)
+		return -1;
+
+	if (endp)
+		*endp = cp;
+
+	return 0;
+}
+
+/* skip ip address. returns it lenght. */
+static int epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	const char *aux = dptr;
+	u_int32_t ip;
+
+	if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) {
+		DEBUGP("ip: %s parse failed.!\n", dptr);
+		return 0;
+	}
+
+	/* Port number */
+	if (*dptr == ':') {
+		dptr++;
+		dptr += digits_len(dptr, limit, shift);
+	}
+	return dptr - aux;
+}
+
+/* get address length, skiping user info. */
+static int skp_epaddr_len(const char *dptr, const char *limit, int *shift)
+{
+	int s = *shift;
+
+	for (; dptr <= limit && *dptr != '@'; dptr++)
+		(*shift)++;
+
+	if (*dptr == '@') {
+		dptr++;
+		(*shift)++;
+	} else
+		*shift = s;
+
+	return epaddr_len(dptr, limit, shift);
+}
+
+/* Returns 0 if not found, -1 error parsing. */
+int ct_sip_get_info(const char *dptr, size_t dlen,
+		    unsigned int *matchoff,
+		    unsigned int *matchlen,
+		    struct sip_header_nfo *hnfo)
+{
+	const char *limit, *aux, *k = dptr;
+	int shift = 0;
+
+	limit = dptr + (dlen - hnfo->lnlen);
+
+	while (dptr <= limit) {
+		if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) &&
+		    (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) {
+			dptr++;
+			continue;
+		}
+		aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen,
+		                    ct_sip_lnlen(dptr, limit));
+		if (!aux) {
+			DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
+			       hnfo->lname);
+			return -1;
+		}
+		aux += hnfo->ln_strlen;
+
+		*matchlen = hnfo->match_len(aux, limit, &shift);
+		if (!*matchlen)
+			return -1;
+
+		*matchoff = (aux - k) + shift;
+
+		DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
+		       *matchlen);
+		return 1;
+	}
+	DEBUGP("%s header not found.\n", hnfo->lname);
+	return 0;
+}
+
+static int set_expected_rtp(struct sk_buff **pskb,
+			    struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    u_int32_t ipaddr, u_int16_t port,
+			    const char *dptr)
+{
+	struct ip_conntrack_expect *exp;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	int ret;
+
+	exp = ip_conntrack_expect_alloc(ct);
+	if (exp == NULL)
+		return NF_DROP;
+
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.udp.port = 0;
+	exp->tuple.dst.ip = ipaddr;
+	exp->tuple.dst.u.udp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_UDP;
+
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.udp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.udp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+
+	exp->expectfn = NULL;
+	exp->flags = 0;
+
+	if (ip_nat_sdp_hook)
+		ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr);
+	else {
+		if (ip_conntrack_expect_related(exp) != 0)
+			ret = NF_DROP;
+		else
+			ret = NF_ACCEPT;
+	}
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+static int sip_help(struct sk_buff **pskb,
+		    struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	const char *dptr;
+	int ret = NF_ACCEPT;
+	int matchoff, matchlen;
+	u_int32_t ipaddr;
+	u_int16_t port;
+
+	/* No Data ? */
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	if (dataoff >= (*pskb)->len) {
+		DEBUGP("skb->len = %u\n", (*pskb)->len);
+		return NF_ACCEPT;
+        }
+
+	ip_ct_refresh(ct, *pskb, sip_timeout * HZ);
+
+	if (!skb_is_nonlinear(*pskb))
+		dptr = (*pskb)->data + dataoff;
+	else {
+		DEBUGP("Copy of skbuff not supported yet.\n");
+		goto out;
+	}
+
+	if (ip_nat_sip_hook) {
+		if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) {
+			ret = NF_DROP;
+			goto out;
+		}
+	}
+
+	/* After this point NAT, could have mangled skb, so
+	   we need to recalculate payload lenght. */
+	datalen = (*pskb)->len - dataoff;
+
+	if (datalen < (sizeof("SIP/2.0 200") - 1))
+		goto out;
+
+	/* RTP info only in some SDP pkts */
+	if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 &&
+	    memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) {
+		goto out;
+	}
+	/* Get ip and port address from SDP packet. */
+	if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+	                    &ct_sip_hdrs[POS_CONNECTION]) > 0) {
+
+		/* We'll drop only if there are parse problems. */
+		if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr,
+		                 dptr + datalen) < 0) {
+			ret = NF_DROP;
+			goto out;
+		}
+		if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen,
+		                    &ct_sip_hdrs[POS_MEDIA]) > 0) {
+
+			port = simple_strtoul(dptr + matchoff, NULL, 10);
+			if (port < 1024) {
+				ret = NF_DROP;
+				goto out;
+			}
+			ret = set_expected_rtp(pskb, ct, ctinfo,
+					       ipaddr, port, dptr);
+		}
+	}
+out:
+	return ret;
+}
+
+static struct ip_conntrack_helper sip[MAX_PORTS];
+static char sip_names[MAX_PORTS][10];
+
+static void fini(void)
+{
+	int i;
+	for (i = 0; i < ports_c; i++) {
+		DEBUGP("unregistering helper for port %d\n", ports[i]);
+		ip_conntrack_helper_unregister(&sip[i]);
+	}
+}
+
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SIP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Create helper structure */
+		memset(&sip[i], 0, sizeof(struct ip_conntrack_helper));
+
+		sip[i].tuple.dst.protonum = IPPROTO_UDP;
+		sip[i].tuple.src.u.udp.port = htons(ports[i]);
+		sip[i].mask.src.u.udp.port = 0xFFFF;
+		sip[i].mask.dst.protonum = 0xFF;
+		sip[i].max_expected = 1;
+		sip[i].timeout = 3 * 60; /* 3 minutes */
+		sip[i].me = THIS_MODULE;
+		sip[i].help = sip_help;
+
+		tmpname = &sip_names[i][0];
+		if (ports[i] == SIP_PORT)
+			sprintf(tmpname, "sip");
+		else
+			sprintf(tmpname, "sip-%d", i);
+		sip[i].name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret = ip_conntrack_helper_register(&sip[i]);
+		if (ret) {
+			printk("ERROR registering helper for port %d\n",
+				ports[i]);
+			fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c
new file mode 100644
index 000000000000..6ffba63adca2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_sip.c
@@ -0,0 +1,249 @@
+/* SIP extension for UDP NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+extern struct sip_header_nfo ct_sip_hdrs[];
+
+static unsigned int mangle_sip_packet(struct sk_buff **pskb,
+				      enum ip_conntrack_info ctinfo,
+				      struct ip_conntrack *ct,
+				      const char **dptr, size_t dlen,
+				      char *buffer, int bufflen,
+				      struct sip_header_nfo *hnfo)
+{
+	unsigned int matchlen, matchoff;
+
+	if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0)
+		return 0;
+
+	if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+	                              matchoff, matchlen, buffer, bufflen))
+		return 0;
+
+	/* We need to reload this. Thanks Patrick. */
+	*dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	return 1;
+}
+
+static unsigned int ip_nat_sip(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       const char **dptr)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+	unsigned int bufflen, dataoff;
+	u_int32_t ip;
+	u_int16_t port;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	ip   = ct->tuplehash[!dir].tuple.dst.ip;
+	port = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+	bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port));
+
+	/* short packet ? */
+	if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1))
+		return 0;
+
+	/* Basic rules: requests and responses. */
+	if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) {
+		const char *aux;
+
+		if ((ctinfo) < IP_CT_IS_REPLY) {
+			mangle_sip_packet(pskb, ctinfo, ct, dptr,
+			                  (*pskb)->len - dataoff,
+			                  buffer, bufflen,
+			                  &ct_sip_hdrs[POS_CONTACT]);
+			return 1;
+		}
+
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* This search should ignore case, but later.. */
+		aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1,
+		                    (*pskb)->len - dataoff);
+		if (!aux)
+			return 0;
+
+		if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"),
+		    ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff)))
+			return 1;
+
+		return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+					 (*pskb)->len - dataoff,
+		                         buffer, bufflen,
+					 &ct_sip_hdrs[POS_CONTACT]);
+	}
+	if ((ctinfo) < IP_CT_IS_REPLY) {
+		if (!mangle_sip_packet(pskb, ctinfo, ct, dptr,
+				       (*pskb)->len - dataoff,
+		                       buffer, bufflen, &ct_sip_hdrs[POS_VIA]))
+			return 0;
+
+		/* Mangle Contact if exists only. - watch udp_nat_mangle()! */
+		mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff,
+		                  buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]);
+		return 1;
+	}
+	/* This mangle requests headers. */
+	return mangle_sip_packet(pskb, ctinfo, ct, dptr,
+	                         ct_sip_lnlen(*dptr,
+				              *dptr + (*pskb)->len - dataoff),
+	                         buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]);
+}
+
+static int mangle_content_len(struct sk_buff **pskb,
+			      enum ip_conntrack_info ctinfo,
+			      struct ip_conntrack *ct,
+			      const char *dptr)
+{
+	unsigned int dataoff, matchoff, matchlen;
+	char buffer[sizeof("65536")];
+	int bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Get actual SDP lenght */
+	if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+	                    &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) {
+
+		/* since ct_sip_get_info() give us a pointer passing 'v='
+		   we need to add 2 bytes in this count. */
+		int c_len = (*pskb)->len - dataoff - matchoff + 2;
+
+		/* Now, update SDP lenght */
+		if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff,
+		                    &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) {
+
+			bufflen = sprintf(buffer, "%u", c_len);
+
+			return ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+							matchoff, matchlen,
+							buffer, bufflen);
+		}
+	}
+	return 0;
+}
+
+static unsigned int mangle_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack *ct,
+			       u_int32_t newip, u_int16_t port,
+			       const char *dptr)
+{
+	char buffer[sizeof("nnn.nnn.nnn.nnn")];
+	unsigned int dataoff, bufflen;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+
+	/* Mangle owner and contact info. */
+	bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip));
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_OWNER]))
+		return 0;
+
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION]))
+		return 0;
+
+	/* Mangle media port. */
+	bufflen = sprintf(buffer, "%u", port);
+	if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff,
+	                       buffer, bufflen, &ct_sip_hdrs[POS_MEDIA]))
+		return 0;
+
+	return mangle_content_len(pskb, ctinfo, ct, dptr);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int ip_nat_sdp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       struct ip_conntrack_expect *exp,
+			       const char *dptr)
+{
+	struct ip_conntrack *ct = exp->master;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	u_int32_t newip;
+	u_int16_t port;
+
+	DEBUGP("ip_nat_sdp():\n");
+
+	/* Connection will come from reply */
+	newip = ct->tuplehash[!dir].tuple.dst.ip;
+
+	exp->tuple.dst.ip = newip;
+	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as the
+	   this one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) {
+		exp->tuple.dst.u.udp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) {
+		ip_conntrack_unexpect_related(exp);
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_sip_hook = NULL;
+	ip_nat_sdp_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_sip_hook);
+	BUG_ON(ip_nat_sdp_hook);
+	ip_nat_sip_hook = ip_nat_sip;
+	ip_nat_sdp_hook = ip_nat_sdp;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
-- 
cgit v1.2.3


From 72dc5b9225c53310c010b68a70ea97c8c8e24bdf Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 5 Jun 2006 17:30:08 -0700
Subject: [TCP]: Minimum congestion window consolidation.

Many of the TCP congestion methods all just use ssthresh
as the minimum congestion window on decrease.  Rather than
duplicating the code, just have that be the default if that
handle in the ops structure is not set.

Minor behaviour change to TCP compound.  It probably wants
to use this (ssthresh) as lower bound, rather than ssthresh/2
because the latter causes undershoot on loss.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  4 ++--
 net/ipv4/tcp_bic.c      |  7 -------
 net/ipv4/tcp_compound.c |  1 -
 net/ipv4/tcp_cong.c     |  6 +++---
 net/ipv4/tcp_cubic.c    |  6 ------
 net/ipv4/tcp_htcp.c     |  9 ---------
 net/ipv4/tcp_input.c    | 13 +++++++++++--
 net/ipv4/tcp_veno.c     |  7 -------
 net/ipv4/tcp_westwood.c | 18 +++++++-----------
 9 files changed, 23 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f1f472746e6c..de88c5472bfc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -632,7 +632,7 @@ struct tcp_congestion_ops {
 	/* return slow start threshold (required) */
 	u32 (*ssthresh)(struct sock *sk);
 	/* lower bound for congestion window (optional) */
-	u32 (*min_cwnd)(struct sock *sk);
+	u32 (*min_cwnd)(const struct sock *sk);
 	/* do new cwnd calculation (required) */
 	void (*cong_avoid)(struct sock *sk, u32 ack,
 			   u32 rtt, u32 in_flight, int good_ack);
@@ -667,7 +667,7 @@ extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct sock *sk);
 extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack,
 				u32 rtt, u32 in_flight, int flag);
-extern u32 tcp_reno_min_cwnd(struct sock *sk);
+extern u32 tcp_reno_min_cwnd(const struct sock *sk);
 extern struct tcp_congestion_ops tcp_reno;
 
 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 035f2092d73a..b2d9021ad22b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "bic",
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c
index ec68cb8081c1..bc54f7e9aea9 100644
--- a/net/ipv4/tcp_compound.c
+++ b/net/ipv4/tcp_compound.c
@@ -419,7 +419,6 @@ static struct tcp_congestion_ops tcp_compound = {
 	.init		= tcp_compound_init,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_compound_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
 	.rtt_sample	= tcp_compound_rtt_calc,
 	.set_state	= tcp_compound_state,
 	.cwnd_event	= tcp_compound_cwnd_event,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 91c2f41c7f58..857eefc52aab 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 	int ret = 0;
 
 	/* all algorithms must implement ssthresh and cong_avoid ops */
-	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+	if (!ca->ssthresh || !ca->cong_avoid) {
 		printk(KERN_ERR "TCP %s does not implement required ops\n",
 		       ca->name);
 		return -EINVAL;
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-/* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct sock *sk)
+/* Lower bound on congestion window with halving. */
+u32 tcp_reno_min_cwnd(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh/2;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 31a4986dfbf7..78b7a6b9e4de 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
 	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct sock *sk)
-{
-	return tcp_sk(sk)->snd_ssthresh;
-}
-
 static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = {
 	.cong_avoid	= bictcp_cong_avoid,
 	.set_state	= bictcp_state,
 	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
 	.pkts_acked     = bictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "cubic",
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 1b2ff53f98ed..3d92c1859267 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	}
 }
 
-/* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
-
 static void htcp_init(struct sock *sk)
 {
 	struct htcp *ca = inet_csk_ca(sk);
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state)
 static struct tcp_congestion_ops htcp = {
 	.init		= htcp_init,
 	.ssthresh	= htcp_recalc_ssthresh,
-	.min_cwnd	= htcp_min_cwnd,
 	.cong_avoid	= htcp_cong_avoid,
 	.set_state	= htcp_state,
 	.undo_cwnd	= htcp_cwnd_undo,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6d167889a4b0..e08245bdda3a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1689,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Lower bound on congestion window is slow start threshold
+ * unless congestion avoidance choice decides to overide it.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
+	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+}
+
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+	if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 1091671751c4..11b42a7135c1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -199,17 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
 		return max(tp->snd_cwnd >> 1U, 2U);
 }
 
-static u32 tcp_veno_min_cwnd(struct sock * sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh;
-}
-
 static struct tcp_congestion_ops tcp_veno = {
 	.init		= tcp_veno_init,
 	.ssthresh	= tcp_veno_ssthresh,
 	.cong_avoid	= tcp_veno_cong_avoid,
-	.min_cwnd 	= tcp_veno_min_cwnd,
 	.rtt_sample	= tcp_veno_rtt_calc,
 	.set_state	= tcp_veno_state,
 	.cwnd_event	= tcp_veno_cwnd_event,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 0c340c3756c2..29eb258b6d82 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -162,12 +162,6 @@ static inline u32 westwood_acked_count(struct sock *sk)
 	return w->cumul_ack;
 }
 
-static inline u32 westwood_bw_rttmin(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct westwood *w = inet_csk_ca(sk);
-	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
-}
 
 /*
  * TCP Westwood
@@ -175,9 +169,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk)
  * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
  * so avoids ever returning 0.
  */
-static u32 tcp_westwood_cwnd_min(struct sock *sk)
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
 {
-	return westwood_bw_rttmin(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct westwood *w = inet_csk_ca(sk);
+	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
 static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
@@ -191,11 +187,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 		break;
 
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(sk);
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_SLOW_ACK:
@@ -235,7 +231,7 @@ static struct tcp_congestion_ops tcp_westwood = {
 	.init		= tcp_westwood_init,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_westwood_cwnd_min,
+	.min_cwnd	= tcp_westwood_bw_rttmin,
 	.cwnd_event	= tcp_westwood_event,
 	.get_info	= tcp_westwood_info,
 	.pkts_acked	= tcp_westwood_pkts_acked,
-- 
cgit v1.2.3


From 338fcf9886df9ad2873772197a73a57818973316 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 5 Jun 2006 21:04:39 -0700
Subject: [IPV4] igmp: Fixup struct ip_mc_list::multiaddr type

All users except two expect 32-bit big-endian value. One is of

	->multiaddr = ->multiaddr

variety. And last one is "%08lX".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 net/ipv4/igmp.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 28f4f3b36950..899c3d4776f3 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -169,7 +169,7 @@ struct ip_sf_list
 struct ip_mc_list
 {
 	struct in_device	*interface;
-	unsigned long		multiaddr;
+	__be32			multiaddr;
 	struct ip_sf_list	*sources;
 	struct ip_sf_list	*tomb;
 	unsigned int		sfmode;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d512239a1473..ab680c851aa2 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 		}
 
 		seq_printf(seq,
-			   "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
 			   im->multiaddr, im->users,
 			   im->tm_running, im->tm_running ?
 			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
-- 
cgit v1.2.3


From 6d7416535097ed0943bdae8e69c14ba43061cab1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 5 Jun 2006 21:06:41 -0700
Subject: [IPV4]: Right prototype of __raw_v4_lookup()

All users pass 32-bit values as addresses and internally they're
compared with 32-bit entities. So, change "laddr" and "raddr" types to
__be32.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 2 +-
 net/ipv4/raw.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index e67b28a0248c..d83571fe4c69 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -36,7 +36,7 @@ extern rwlock_t raw_v4_lock;
 
 
 extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-				    unsigned long raddr, unsigned long laddr,
+				    __be32 raddr, __be32 laddr,
 				    int dif);
 
 extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc2562415555..bd221ec3f81e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk)
 }
 
 struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
-			     unsigned long raddr, unsigned long laddr,
+			     __be32 raddr, __be32 laddr,
 			     int dif)
 {
 	struct hlist_node *node;
-- 
cgit v1.2.3


From c8c05a8eec6f1258f6d5cb71a44ee5dc1e989b63 Mon Sep 17 00:00:00 2001
From: Catherine Zhang <cxzhang@watson.ibm.com>
Date: Thu, 8 Jun 2006 23:39:49 -0700
Subject: [LSM-IPsec]: SELinux Authorize

This patch contains a fix for the previous patch that adds security
contexts to IPsec policies and security associations.  In the previous
patch, no authorization (besides the check for write permissions to
SAD and SPD) is required to delete IPsec policies and security
assocations with security contexts.  Thus a user authorized to change
SAD and SPD can bypass the IPsec policy authorization by simply
deleteing policies with security contexts.  To fix this security hole,
an additional authorization check is added for removing security
policies and security associations with security contexts.

Note that if no security context is supplied on add or present on
policy to be deleted, the SELinux module allows the change
unconditionally.  The hook is called on deletion when no context is
present, which we may want to change.  At present, I left it up to the
module.

LSM changes:

The patch adds two new LSM hooks: xfrm_policy_delete and
xfrm_state_delete.  The new hooks are necessary to authorize deletion
of IPsec policies that have security contexts.  The existing hooks
xfrm_policy_free and xfrm_state_free lack the context to do the
authorization, so I decided to split authorization of deletion and
memory management of security data, as is typical in the LSM
interface.

Use:

The new delete hooks are checked when xfrm_policy or xfrm_state are
deleted by either the xfrm_user interface (xfrm_get_policy,
xfrm_del_sa) or the pfkey interface (pfkey_spddelete, pfkey_delete).

SELinux changes:

The new policy_delete and state_delete functions are added.

Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h        | 40 ++++++++++++++++++++++++++++++++++------
 net/key/af_key.c                | 17 +++++++++++------
 net/xfrm/xfrm_user.c            | 19 ++++++++++++-------
 security/dummy.c                | 12 ++++++++++++
 security/selinux/hooks.c        |  2 ++
 security/selinux/include/xfrm.h |  2 ++
 security/selinux/xfrm.c         | 39 +++++++++++++++++++++++++++++++++++----
 7 files changed, 108 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 1bab48f6aeac..14c9bd050607 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -805,31 +805,37 @@ struct swap_info_struct;
  *	used by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level policy update program (e.g., setkey).
- *	Allocate a security structure to the xp->selector.security field.
+ *	Allocate a security structure to the xp->security field.
  *	The security field is initialized to NULL when the xfrm_policy is
  *	allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context)
  * @xfrm_policy_clone_security:
  *	@old contains an existing xfrm_policy in the SPD.
  *	@new contains a new xfrm_policy being cloned from old.
- *	Allocate a security structure to the new->selector.security field
- *	that contains the information from the old->selector.security field.
+ *	Allocate a security structure to the new->security field
+ *	that contains the information from the old->security field.
  *	Return 0 if operation was successful (memory to allocate).
  * @xfrm_policy_free_security:
  *	@xp contains the xfrm_policy
- *	Deallocate xp->selector.security.
+ *	Deallocate xp->security.
+ * @xfrm_policy_delete_security:
+ *	@xp contains the xfrm_policy.
+ *	Authorize deletion of xp->security.
  * @xfrm_state_alloc_security:
  *	@x contains the xfrm_state being added to the Security Association
  *	Database by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level SA generation program (e.g., setkey or racoon).
- *	Allocate a security structure to the x->sel.security field.  The
+ *	Allocate a security structure to the x->security field.  The
  *	security field is initialized to NULL when the xfrm_state is
  *	allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context).
  * @xfrm_state_free_security:
  *	@x contains the xfrm_state.
- *	Deallocate x>sel.security.
+ *	Deallocate x->security.
+ * @xfrm_state_delete_security:
+ *	@x contains the xfrm_state.
+ *	Authorize deletion of x->security.
  * @xfrm_policy_lookup:
  *	@xp contains the xfrm_policy for which the access control is being
  *	checked.
@@ -1298,8 +1304,10 @@ struct security_operations {
 	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
 	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
 	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
+	int (*xfrm_policy_delete_security) (struct xfrm_policy *xp);
 	int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
 	void (*xfrm_state_free_security) (struct xfrm_state *x);
+	int (*xfrm_state_delete_security) (struct xfrm_state *x);
 	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
@@ -2934,11 +2942,21 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
 	security_ops->xfrm_policy_free_security(xp);
 }
 
+static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	return security_ops->xfrm_policy_delete_security(xp);
+}
+
 static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return security_ops->xfrm_state_alloc_security(x, sec_ctx);
 }
 
+static inline int security_xfrm_state_delete(struct xfrm_state *x)
+{
+	return security_ops->xfrm_state_delete_security(x);
+}
+
 static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 	security_ops->xfrm_state_free_security(x);
@@ -2963,6 +2981,11 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
 {
 }
 
+static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return 0;
@@ -2972,6 +2995,11 @@ static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 }
 
+static inline int security_xfrm_state_delete(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 {
 	return 0;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 859582275cab..d5e2121ea207 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	if (x == NULL)
 		return -ESRCH;
 
+	if ((err = security_xfrm_state_delete(x)))
+		goto out;
+
 	if (xfrm_state_kern(x)) {
-		xfrm_state_put(x);
-		return -EPERM;
+		err = -EPERM;
+		goto out;
 	}
 	
 	err = xfrm_state_delete(x);
-	if (err < 0) {
-		xfrm_state_put(x);
-		return err;
-	}
+	if (err < 0)
+		goto out;
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	c.event = XFRM_MSG_DELSA;
 	km_state_notify(x, &c);
+out:
 	xfrm_state_put(x);
 
 	return err;
@@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 
 	err = 0;
 
+	if ((err = security_xfrm_policy_delete(xp)))
+		goto out;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	c.event = XFRM_MSG_DELPOLICY;
 	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 
+out:
 	xfrm_pol_put(xp);
 	return err;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 81d1005830f4..a3733d2db3ba 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (x == NULL)
 		return -ESRCH;
 
+	if (err = security_xfrm_state_delete(x))
+		goto out;
+
 	if (xfrm_state_kern(x)) {
-		xfrm_state_put(x);
-		return -EPERM;
+		err = -EPERM;
+		goto out;
 	}
 
 	err = xfrm_state_delete(x);
-	if (err < 0) {
-		xfrm_state_put(x);
-		return err;
-	}
+	if (err < 0)
+		goto out;
 
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
 	c.event = nlh->nlmsg_type;
 	km_state_notify(x, &c);
-	xfrm_state_put(x);
 
+out:
+	xfrm_state_put(x);
 	return err;
 }
 
@@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      MSG_DONTWAIT);
 		}
 	} else {
+		if (err = security_xfrm_policy_delete(xp))
+			goto out;
 		c.data.byid = p->index;
 		c.event = nlh->nlmsg_type;
 		c.seq = nlh->nlmsg_seq;
@@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 	xfrm_pol_put(xp);
 
+out:
 	return err;
 }
 
diff --git a/security/dummy.c b/security/dummy.c
index 8ccccccc12ac..64f6da0f422e 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -810,6 +810,11 @@ static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
 {
 }
 
+static int dummy_xfrm_policy_delete_security(struct xfrm_policy *xp)
+{
+	return 0;
+}
+
 static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
 {
 	return 0;
@@ -819,6 +824,11 @@ static void dummy_xfrm_state_free_security(struct xfrm_state *x)
 {
 }
 
+static int dummy_xfrm_state_delete_security(struct xfrm_state *x)
+{
+	return 0;
+}
+
 static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 {
 	return 0;
@@ -1024,8 +1034,10 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
 	set_to_dummy_if_null(ops, xfrm_policy_clone_security);
 	set_to_dummy_if_null(ops, xfrm_policy_free_security);
+	set_to_dummy_if_null(ops, xfrm_policy_delete_security);
 	set_to_dummy_if_null(ops, xfrm_state_alloc_security);
 	set_to_dummy_if_null(ops, xfrm_state_free_security);
+	set_to_dummy_if_null(ops, xfrm_state_delete_security);
 	set_to_dummy_if_null(ops, xfrm_policy_lookup);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 #ifdef CONFIG_KEYS
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 90b4cdc0c948..cf7b62ca886a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4374,8 +4374,10 @@ static struct security_operations selinux_ops = {
 	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
 	.xfrm_policy_clone_security =	selinux_xfrm_policy_clone,
 	.xfrm_policy_free_security =	selinux_xfrm_policy_free,
+	.xfrm_policy_delete_security =	selinux_xfrm_policy_delete,
 	.xfrm_state_alloc_security =	selinux_xfrm_state_alloc,
 	.xfrm_state_free_security =	selinux_xfrm_state_free,
+	.xfrm_state_delete_security =	selinux_xfrm_state_delete,
 	.xfrm_policy_lookup = 		selinux_xfrm_policy_lookup,
 #endif
 };
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index c10f1fc41502..f0f4e480ff99 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -9,8 +9,10 @@
 int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
 int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
 void selinux_xfrm_policy_free(struct xfrm_policy *xp);
+int selinux_xfrm_policy_delete(struct xfrm_policy *xp);
 int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
 void selinux_xfrm_state_free(struct xfrm_state *x);
+int selinux_xfrm_state_delete(struct xfrm_state *x);
 int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
 
 /*
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index abe99d881376..0e24df41099f 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -132,10 +132,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_us
 		goto out;
 
 	/*
-	 * Does the subject have permission to set security or permission to
-	 * do the relabel?
-	 * Must be permitted to relabel from default socket type (process type)
-	 * to specified context
+	 * Does the subject have permission to set security context?
 	 */
 	rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
 			  SECCLASS_ASSOCIATION,
@@ -200,6 +197,23 @@ void selinux_xfrm_policy_free(struct xfrm_policy *xp)
 		kfree(ctx);
 }
 
+/*
+ * LSM hook implementation that authorizes deletion of labeled policies.
+ */
+int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
+{
+	struct task_security_struct *tsec = current->security;
+	struct xfrm_sec_ctx *ctx = xp->security;
+	int rc = 0;
+
+	if (ctx)
+		rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+				  SECCLASS_ASSOCIATION,
+				  ASSOCIATION__SETCONTEXT, NULL);
+
+	return rc;
+}
+
 /*
  * LSM hook implementation that allocs and transfers sec_ctx spec to
  * xfrm_state.
@@ -292,6 +306,23 @@ u32 selinux_socket_getpeer_dgram(struct sk_buff *skb)
 	return SECSID_NULL;
 }
 
+ /*
+  * LSM hook implementation that authorizes deletion of labeled SAs.
+  */
+int selinux_xfrm_state_delete(struct xfrm_state *x)
+{
+	struct task_security_struct *tsec = current->security;
+	struct xfrm_sec_ctx *ctx = x->security;
+	int rc = 0;
+
+	if (ctx)
+		rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+				  SECCLASS_ASSOCIATION,
+				  ASSOCIATION__SETCONTEXT, NULL);
+
+	return rc;
+}
+
 /*
  * LSM hook that controls access to unlabelled packets.  If
  * a xfrm_state is authorizable (defined by macro) then it was
-- 
cgit v1.2.3


From 6f68dc37759b1d6ff3b4d4a9d097605a09f8f043 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 8 Jun 2006 23:58:52 -0700
Subject: [NET]: Fix warnings after LSM-IPSEC changes.

Assignment used as truth value in xfrm_del_sa()
and xfrm_get_policy().

Wrong argument type declared for security_xfrm_state_delete()
when SELINUX is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h | 2 +-
 net/xfrm/xfrm_user.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 14c9bd050607..4dfb1b84a9b3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2995,7 +2995,7 @@ static inline void security_xfrm_state_free(struct xfrm_state *x)
 {
 }
 
-static inline int security_xfrm_state_delete(struct xfrm_policy *xp)
+static inline int security_xfrm_state_delete(struct xfrm_state *x)
 {
 	return 0;
 }
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a3733d2db3ba..c21dc26141ea 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -427,7 +427,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (x == NULL)
 		return -ESRCH;
 
-	if (err = security_xfrm_state_delete(x))
+	if ((err = security_xfrm_state_delete(x)) != 0)
 		goto out;
 
 	if (xfrm_state_kern(x)) {
@@ -1057,7 +1057,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      MSG_DONTWAIT);
 		}
 	} else {
-		if (err = security_xfrm_policy_delete(xp))
+		if ((err = security_xfrm_policy_delete(xp)) != 0)
 			goto out;
 		c.data.byid = p->index;
 		c.event = nlh->nlmsg_type;
-- 
cgit v1.2.3


From c749b29fae74ed59c507d84025b3298202b42609 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:28:25 -0700
Subject: [SECMARK]: Add SELinux exports

Add and export new functions to the in-kernel SELinux API in support of the
new secmark-based packet controls.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/selinux.h    | 32 ++++++++++++++++++++++++++++++++
 security/selinux/exports.c | 22 ++++++++++++++++++++++
 2 files changed, 54 insertions(+)

(limited to 'include')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 4047bcde4484..aad4e390d6a5 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -118,6 +118,27 @@ void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid);
  */
 void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
 
+/**
+ *     selinux_string_to_sid - map a security context string to a security ID
+ *     @str: the security context string to be mapped
+ *     @sid: ID value returned via this.
+ *
+ *     Returns 0 if successful, with the SID stored in sid.  A value
+ *     of zero for sid indicates no SID could be determined (but no error
+ *     occurred).
+ */
+int selinux_string_to_sid(char *str, u32 *sid);
+
+/**
+ *     selinux_relabel_packet_permission - check permission to relabel a packet
+ *     @sid: ID value to be applied to network packet (via SECMARK, most likely)
+ *
+ *     Returns 0 if the current task is allowed to label packets with the
+ *     supplied security ID.  Note that it is implicit that the packet is always
+ *     being relabeled from the default unlabled value, and that the access
+ *     control decision is made in the AVC.
+ */
+int selinux_relabel_packet_permission(u32 sid);
 
 #else
 
@@ -172,6 +193,17 @@ static inline void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
 	*sid = 0;
 }
 
+static inline int selinux_string_to_sid(const char *str, u32 *sid)
+{
+       *sid = 0;
+       return 0;
+}
+
+static inline int selinux_relabel_packet_permission(u32 sid)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index ae4c73eb3085..9d7737db5e51 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -72,3 +72,25 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
 	*sid = 0;
 }
 
+int selinux_string_to_sid(char *str, u32 *sid)
+{
+	if (selinux_enabled)
+		return security_context_to_sid(str, strlen(str), sid);
+	else {
+		*sid = 0;
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(selinux_string_to_sid);
+
+int selinux_relabel_packet_permission(u32 sid)
+{
+	if (selinux_enabled) {
+		struct task_security_struct *tsec = current->security;
+
+		return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET,
+				    PACKET__RELABELTO, NULL);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission);
-- 
cgit v1.2.3


From 984bc16cc92ea3c247bf34ad667cfb95331b9d3c Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:29:17 -0700
Subject: [SECMARK]: Add secmark support to core networking.

Add a secmark field to the skbuff structure, to allow security subsystems to
place security markings on network packets.  This is similar to the nfmark
field, except is intended for implementing security policy, rather than than
networking policy.

This patch was already acked in principle by Dave Miller.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 22 ++++++++++++++++++++++
 net/Kconfig                     |  7 +++++++
 net/core/skbuff.c               |  3 ++-
 net/ipv4/ip_output.c            |  1 +
 net/ipv4/netfilter/ipt_REJECT.c |  1 +
 net/ipv6/ip6_output.c           |  1 +
 6 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 23bad3bf3c9d..fe2c58e5306f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -210,6 +210,7 @@ enum {
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
+ *	@secmark: security marking
  */
 
 struct sk_buff {
@@ -289,6 +290,9 @@ struct sk_buff {
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
+#ifdef CONFIG_NETWORK_SECMARK
+	__u32			secmark;
+#endif
 
 
 	/* These elements must be at the end, see alloc_skb() for details.  */
@@ -1400,5 +1404,23 @@ static inline void nf_reset(struct sk_buff *skb)
 static inline void nf_reset(struct sk_buff *skb) {}
 #endif /* CONFIG_NETFILTER */
 
+#ifdef CONFIG_NETWORK_SECMARK
+static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
+{
+	to->secmark = from->secmark;
+}
+
+static inline void skb_init_secmark(struct sk_buff *skb)
+{
+	skb->secmark = 0;
+}
+#else
+static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
+{ }
+
+static inline void skb_init_secmark(struct sk_buff *skb)
+{ }
+#endif
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/Kconfig b/net/Kconfig
index ccadc8e48152..c6cec5aa5486 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -66,6 +66,13 @@ source "net/ipv6/Kconfig"
 
 endif # if INET
 
+config NETWORK_SECMARK
+	bool "Security Marking"
+	help
+	  This enables security marking of network packets, similar
+	  to nfmark, but designated for security purposes.
+	  If you are unsure how to answer this question, answer N.
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f9c094..96cdcbe24ba2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
 	C(input_dev);
 #endif
-
+	skb_copy_secmark(n, skb);
 #endif
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	new->tc_index	= old->tc_index;
 #endif
+	skb_copy_secmark(new, old);
 	atomic_set(&new->users, 1);
 	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
 	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cff9c3a72daf..d4bb3fae4e49 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+	skb_copy_secmark(to, from);
 }
 
 /*
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0bba3c2bb786..431a3ce6f7b7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
 	nskb->nfmark = 0;
+	skb_init_secmark(nskb);
 
 	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 416f6e428a0a..d29620f4910e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -459,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	nf_bridge_get(to->nf_bridge);
 #endif
 #endif
+	skb_copy_secmark(to, from);
 }
 
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
-- 
cgit v1.2.3


From 5e6874cdb8de94cd3c15d853a8ef9c6f4c305055 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:30:57 -0700
Subject: [SECMARK]: Add xtables SECMARK target

Add a SECMARK target to xtables, allowing the admin to apply security
marks to packets via both iptables and ip6tables.

The target currently handles SELinux security marking, but can be
extended for other purposes as needed.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_SECMARK.h |  26 ++++++
 net/netfilter/Kconfig                |   9 ++
 net/netfilter/Makefile               |   1 +
 net/netfilter/xt_SECMARK.c           | 156 +++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 include/linux/netfilter/xt_SECMARK.h
 create mode 100644 net/netfilter/xt_SECMARK.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h
new file mode 100644
index 000000000000..c53fbffa997d
--- /dev/null
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -0,0 +1,26 @@
+#ifndef _XT_SECMARK_H_target
+#define _XT_SECMARK_H_target
+
+/*
+ * This is intended for use by various security subsystems (but not
+ * at the same time).
+ *
+ * 'mode' refers to the specific security subsystem which the
+ * packets are being marked for.
+ */
+#define SECMARK_MODE_SEL	0x01		/* SELinux */
+#define SECMARK_SELCTX_MAX	256
+
+struct xt_secmark_target_selinux_info {
+	u_int32_t selsid;
+	char selctx[SECMARK_SELCTX_MAX];
+};
+
+struct xt_secmark_target_info {
+	u_int8_t mode;
+	union {
+		struct xt_secmark_target_selinux_info sel;
+	} u;
+};
+
+#endif /*_XT_SECMARK_H_target */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 85a7e1770252..10eccdd4d6ea 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -174,6 +174,15 @@ config NETFILTER_XT_TARGET_NOTRACK
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_TARGET_SECMARK
+	tristate '"SECMARK" target support'
+	depends on NETFILTER_XTABLES && NETWORK_SECMARK
+	help
+	  The SECMARK target allows security marking of network
+	  packets, for use with security subsystems.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_COMMENT
 	tristate  '"comment" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index df1da25f40df..3645de046890 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 000000000000..c2ce9c4011cc
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,156 @@
+/*
+ * Module for modifying the secmark field of the skb, for use by
+ * security subsystems.
+ *
+ * Based on the nfmark match by:
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/selinux.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SECMARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables SECMARK modification module");
+MODULE_ALIAS("ipt_SECMARK");
+MODULE_ALIAS("ip6t_SECMARK");
+
+#define PFX "SECMARK: "
+
+static u8 mode;
+
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+			   const struct net_device *out, unsigned int hooknum,
+			   const struct xt_target *target,
+			   const void *targinfo, void *userinfo)
+{
+	u32 secmark = 0;
+	const struct xt_secmark_target_info *info = targinfo;
+
+	BUG_ON(info->mode != mode);
+
+	switch (mode) {
+	case SECMARK_MODE_SEL:
+		secmark = info->u.sel.selsid;
+		break;
+
+	default:
+		BUG();
+	}
+
+	if ((*pskb)->secmark != secmark)
+		(*pskb)->secmark = secmark;
+
+	return XT_CONTINUE;
+}
+
+static int checkentry_selinux(struct xt_secmark_target_info *info)
+{
+	int err;
+	struct xt_secmark_target_selinux_info *sel = &info->u.sel;
+
+	err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+	if (err) {
+		if (err == -EINVAL)
+			printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n",
+			       sel->selctx);
+		return 0;
+	}
+
+	if (!sel->selsid) {
+		printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n",
+		       sel->selctx);
+		return 0;
+	}
+
+	err = selinux_relabel_packet_permission(sel->selsid);
+	if (err) {
+		printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static int checkentry(const char *tablename, const void *entry,
+		      const struct xt_target *target, void *targinfo,
+		      unsigned int targinfosize, unsigned int hook_mask)
+{
+	struct xt_secmark_target_info *info = targinfo;
+
+	if (mode && mode != info->mode) {
+		printk(KERN_INFO PFX "mode already set to %hu cannot mix with "
+		       "rules for mode %hu\n", mode, info->mode);
+		return 0;
+	}
+
+	switch (info->mode) {
+	case SECMARK_MODE_SEL:
+		if (!checkentry_selinux(info))
+			return 0;
+		break;
+
+	default:
+		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+		return 0;
+	}
+
+	if (!mode)
+		mode = info->mode;
+	return 1;
+}
+
+static struct xt_target ipt_secmark_reg = {
+	.name		= "SECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_secmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET,
+	.revision	= 0,
+};
+
+static struct xt_target ip6t_secmark_reg = {
+	.name		= "SECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_secmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET6,
+	.revision	= 0,
+};
+
+static int __init xt_secmark_init(void)
+{
+	int err;
+
+	err = xt_register_target(&ipt_secmark_reg);
+	if (err)
+		return err;
+
+	err = xt_register_target(&ip6t_secmark_reg);
+	if (err)
+		xt_unregister_target(&ipt_secmark_reg);
+
+	return err;
+}
+
+static void __exit xt_secmark_fini(void)
+{
+	xt_unregister_target(&ip6t_secmark_reg);
+	xt_unregister_target(&ipt_secmark_reg);
+}
+
+module_init(xt_secmark_init);
+module_exit(xt_secmark_fini);
-- 
cgit v1.2.3


From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:31:46 -0700
Subject: [SECMARK]: Add secmark support to conntrack

Add a secmark field to IP and NF conntracks, so that security markings
on packets can be copied to their associated connections, and also
copied back to packets as required.  This is similar to the network
mark field currently used with conntrack, although it is intended for
enforcement of security policy rather than network policy.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h  |  4 ++++
 include/net/netfilter/nf_conntrack.h         |  4 ++++
 include/net/netfilter/nf_conntrack_compat.h  | 26 ++++++++++++++++++++++++++
 net/ipv4/netfilter/Kconfig                   | 12 ++++++++++++
 net/ipv4/netfilter/ip_conntrack_core.c       |  3 +++
 net/ipv4/netfilter/ip_conntrack_standalone.c |  5 +++++
 net/netfilter/Kconfig                        | 12 ++++++++++++
 net/netfilter/nf_conntrack_core.c            |  3 +++
 net/netfilter/nf_conntrack_standalone.c      |  5 +++++
 9 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 17d7ef938a09..e0e9951eb8c3 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -121,6 +121,10 @@ struct ip_conntrack
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Traversed often, so hopefully in different cacheline to top */
 	/* These are my tuples; original and reply */
 	struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index dbe7a114d0c5..411117815807 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -114,6 +114,10 @@ struct nf_conn
 	u_int32_t mark;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	u_int32_t secmark;
+#endif
+
 	/* Storage reserved for other modules: */
 	union nf_conntrack_proto proto;
 
diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h
index 3cac19fb3648..f1b1482d7200 100644
--- a/include/net/netfilter/nf_conntrack_compat.h
+++ b/include/net/netfilter/nf_conntrack_compat.h
@@ -20,6 +20,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_IP_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_SECMARK */
+
 #ifdef CONFIG_IP_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
@@ -70,6 +83,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb,
 }
 #endif /* CONFIG_NF_CONNTRACK_MARK */
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb,
+					   u_int32_t *ctinfo)
+{
+	struct nf_conn *ct = nf_ct_get(skb, ctinfo);
+
+	if (ct)
+		return &ct->secmark;
+	else
+		return NULL;
+}
+#endif /* CONFIG_NF_CONNTRACK_MARK */
+
 #ifdef CONFIG_NF_CT_ACCT
 static inline struct ip_conntrack_counter *
 nf_ct_get_counters(const struct sk_buff *skb)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ff4b118f14a9..e1d7f5fbc526 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on IP_NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config IP_NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && IP_NF_CONNTRACK
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4fe9e69378df..7e4cf9a4d15f 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -723,6 +723,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
 		/* this is ugly, but there is no other place where to put it */
 		conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 6cb9b989d14c..88445aac3f28 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 10eccdd4d6ea..023f81e5f96b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 
+config NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on NF_CONNTRACK && NETWORK_SECMARK
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
 config NF_CONNTRACK_EVENTS
 	bool "Connection tracking events (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && NF_CONNTRACK
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index bc2bd4c3859e..cd299f4b7db1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -989,6 +989,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 		conntrack->master = exp->master;
 #ifdef CONFIG_NF_CONNTRACK_MARK
 		conntrack->mark = exp->master->mark;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+		conntrack->secmark = exp->master->secmark;
 #endif
 		nf_conntrack_get(&conntrack->master->ct_general);
 		NF_CT_STAT_INC(expect_new);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e01d20d8e287..e34c574f0351 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		return -ENOSPC;
 #endif
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	if (seq_printf(s, "secmark=%u ", conntrack->secmark))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
 		return -ENOSPC;
 	
-- 
cgit v1.2.3


From 100468e9c05c10fb6872751c1af523b996d6afa9 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 9 Jun 2006 00:32:39 -0700
Subject: [SECMARK]: Add CONNSECMARK xtables target

Add a new xtables target, CONNSECMARK, which is used to specify rules
for copying security marks from packets to connections, and for
copyying security marks back from connections to packets.  This is
similar to the CONNMARK target, but is more limited in scope in that
it only allows copying of security marks to and from packets, as this
is all it needs to do.

A typical scenario would be to apply a security mark to a 'new' packet
with SECMARK, then copy that to its conntrack via CONNMARK, and then
restore the security mark from the connection to established and
related packets on that connection.

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_CONNSECMARK.h |  13 +++
 net/netfilter/Kconfig                    |  11 +++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/xt_CONNSECMARK.c           | 155 +++++++++++++++++++++++++++++++
 4 files changed, 180 insertions(+)
 create mode 100644 include/linux/netfilter/xt_CONNSECMARK.h
 create mode 100644 net/netfilter/xt_CONNSECMARK.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_CONNSECMARK.h b/include/linux/netfilter/xt_CONNSECMARK.h
new file mode 100644
index 000000000000..c6bd75469ba2
--- /dev/null
+++ b/include/linux/netfilter/xt_CONNSECMARK.h
@@ -0,0 +1,13 @@
+#ifndef _XT_CONNSECMARK_H_target
+#define _XT_CONNSECMARK_H_target
+
+enum {
+	CONNSECMARK_SAVE = 1,
+	CONNSECMARK_RESTORE,
+};
+
+struct xt_connsecmark_target_info {
+	u_int8_t mode;
+};
+
+#endif /*_XT_CONNSECMARK_H_target */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 023f81e5f96b..b1622b7de1cf 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -195,6 +195,17 @@ config NETFILTER_XT_TARGET_SECMARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK)
+	help
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked).  This would
+	  normally be used in conjunction with the SECMARK target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_COMMENT
 	tristate  '"comment" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3645de046890..6fa4b7580458 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 000000000000..8c011e020769
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,155 @@
+/*
+ * This module is used to copy security markings from packets
+ * to connections, and restore security markings from connections
+ * back to packets.  This would normally be performed in conjunction
+ * with the SECMARK target and state match.
+ *
+ * Based somewhat on CONNMARK:
+ *   Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ *    by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNSECMARK.h>
+#include <net/netfilter/nf_conntrack_compat.h>
+
+#define PFX "CONNSECMARK: "
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module");
+MODULE_ALIAS("ipt_CONNSECMARK");
+MODULE_ALIAS("ip6t_CONNSECMARK");
+
+/*
+ * If the packet has a security mark and the connection does not, copy
+ * the security mark from the packet to the connection.
+ */
+static void secmark_save(struct sk_buff *skb)
+{
+	if (skb->secmark) {
+		u32 *connsecmark;
+		enum ip_conntrack_info ctinfo;
+
+		connsecmark = nf_ct_get_secmark(skb, &ctinfo);
+		if (connsecmark && !*connsecmark)
+			if (*connsecmark != skb->secmark)
+				*connsecmark = skb->secmark;
+	}
+}
+
+/*
+ * If packet has no security mark, and the connection does, restore the
+ * security mark from the connection to the packet.
+ */
+static void secmark_restore(struct sk_buff *skb)
+{
+	if (!skb->secmark) {
+		u32 *connsecmark;
+		enum ip_conntrack_info ctinfo;
+
+		connsecmark = nf_ct_get_secmark(skb, &ctinfo);
+		if (connsecmark && *connsecmark)
+			if (skb->secmark != *connsecmark)
+				skb->secmark = *connsecmark;
+	}
+}
+
+static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
+			   const struct net_device *out, unsigned int hooknum,
+			   const struct xt_target *target,
+			   const void *targinfo, void *userinfo)
+{
+	struct sk_buff *skb = *pskb;
+	const struct xt_connsecmark_target_info *info = targinfo;
+
+	switch (info->mode) {
+	case CONNSECMARK_SAVE:
+		secmark_save(skb);
+		break;
+
+	case CONNSECMARK_RESTORE:
+		secmark_restore(skb);
+		break;
+
+	default:
+		BUG();
+	}
+
+	return XT_CONTINUE;
+}
+
+static int checkentry(const char *tablename, const void *entry,
+		      const struct xt_target *target, void *targinfo,
+		      unsigned int targinfosize, unsigned int hook_mask)
+{
+	struct xt_connsecmark_target_info *info = targinfo;
+
+	switch (info->mode) {
+	case CONNSECMARK_SAVE:
+	case CONNSECMARK_RESTORE:
+		break;
+
+	default:
+		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_target ipt_connsecmark_reg = {
+	.name		= "CONNSECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_connsecmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET,
+	.revision	= 0,
+};
+
+static struct xt_target ip6t_connsecmark_reg = {
+	.name		= "CONNSECMARK",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_connsecmark_target_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+	.family		= AF_INET6,
+	.revision	= 0,
+};
+
+static int __init xt_connsecmark_init(void)
+{
+	int err;
+
+	need_conntrack();
+
+	err = xt_register_target(&ipt_connsecmark_reg);
+	if (err)
+		return err;
+
+	err = xt_register_target(&ip6t_connsecmark_reg);
+	if (err)
+		xt_unregister_target(&ipt_connsecmark_reg);
+
+	return err;
+}
+
+static void __exit xt_connsecmark_fini(void)
+{
+	xt_unregister_target(&ip6t_connsecmark_reg);
+	xt_unregister_target(&ipt_connsecmark_reg);
+}
+
+module_init(xt_connsecmark_init);
+module_exit(xt_connsecmark_fini);
-- 
cgit v1.2.3


From 932ff279a43ab7257942cddff2595acd541cc49b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 12:20:56 -0700
Subject: [NET]: Add netif_tx_lock

Various drivers use xmit_lock internally to synchronise with their
transmission routines.  They do so without setting xmit_lock_owner.
This is fine as long as netpoll is not in use.

With netpoll it is possible for deadlocks to occur if xmit_lock_owner
isn't set.  This is because if a printk occurs while xmit_lock is held
and xmit_lock_owner is not set can cause netpoll to attempt to take
xmit_lock recursively.

While it is possible to resolve this by getting netpoll to use
trylock, it is suboptimal because netpoll's sole objective is to
maximise the chance of getting the printk out on the wire.  So
delaying or dropping the message is to be avoided as much as possible.

So the only alternative is to always set xmit_lock_owner.  The
following patch does this by introducing the netif_tx_lock family of
functions that take care of setting/unsetting xmit_lock_owner.

I renamed xmit_lock to _xmit_lock to indicate that it should not be
used directly.  I didn't provide irq versions of the netif_tx_lock
functions since xmit_lock is meant to be a BH-disabling lock.

This is pretty much a straight text substitution except for a small
bug fix in winbond.  It currently uses
netif_stop_queue/spin_unlock_wait to stop transmission.  This is
unsafe as an IRQ can potentially wake up the queue.  So it is safer to
use netif_tx_disable.

The hamradio bits used spin_lock_irq but it is unnecessary as
xmit_lock must never be taken in an IRQ handler.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdevices.txt        |  8 +++---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |  6 ++--
 drivers/media/dvb/dvb-core/dvb_net.c           |  4 +--
 drivers/net/bnx2.c                             |  4 +--
 drivers/net/bonding/bond_main.c                |  2 +-
 drivers/net/forcedeth.c                        | 18 ++++++------
 drivers/net/hamradio/6pack.c                   |  8 +++---
 drivers/net/hamradio/mkiss.c                   |  8 +++---
 drivers/net/ifb.c                              | 10 +++----
 drivers/net/irda/vlsi_ir.c                     |  2 +-
 drivers/net/natsemi.c                          |  4 +--
 drivers/net/tulip/winbond-840.c                |  9 +++---
 drivers/net/wireless/orinoco.c                 |  4 ++-
 include/linux/netdevice.h                      | 38 ++++++++++++++++++++++++--
 net/atm/clip.c                                 |  4 +--
 net/core/dev.c                                 | 12 ++++----
 net/core/dev_mcast.c                           | 28 +++++++++----------
 net/core/netpoll.c                             |  9 ++----
 net/core/pktgen.c                              |  4 +--
 net/sched/sch_generic.c                        | 28 ++++++++-----------
 net/sched/sch_teql.c                           |  9 ++----
 21 files changed, 121 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index 3c0a5ba614d7..847cedb238f6 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -42,9 +42,9 @@ dev->get_stats:
 	Context: nominally process, but don't sleep inside an rwlock
 
 dev->hard_start_xmit:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	When the driver sets NETIF_F_LLTX in dev->features this will be
-	called without holding xmit_lock. In this case the driver 
+	called without holding netif_tx_lock. In this case the driver
 	has to lock by itself when needed. It is recommended to use a try lock
 	for this and return -1 when the spin lock fails. 
 	The locking there should also properly protect against 
@@ -62,12 +62,12 @@ dev->hard_start_xmit:
 	  Only valid when NETIF_F_LLTX is set.
 
 dev->tx_timeout:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	Context: BHs disabled
 	Notes: netif_queue_stopped() is guaranteed true
 
 dev->set_multicast_list:
-	Synchronization: dev->xmit_lock spinlock.
+	Synchronization: netif_tx_lock spinlock.
 	Context: BHs disabled
 
 dev->poll:
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 1dae4b238252..1d917edcf9ba 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -821,7 +821,8 @@ void ipoib_mcast_restart_task(void *dev_ptr)
 
 	ipoib_mcast_stop_thread(dev, 0);
 
-	spin_lock_irqsave(&dev->xmit_lock, flags);
+	local_irq_save(flags);
+	netif_tx_lock(dev);
 	spin_lock(&priv->lock);
 
 	/*
@@ -896,7 +897,8 @@ void ipoib_mcast_restart_task(void *dev_ptr)
 	}
 
 	spin_unlock(&priv->lock);
-	spin_unlock_irqrestore(&dev->xmit_lock, flags);
+	netif_tx_unlock(dev);
+	local_irq_restore(flags);
 
 	/* We have to cancel outside of the spinlock */
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c
index 2f0f35811bf7..9fd87521a163 100644
--- a/drivers/media/dvb/dvb-core/dvb_net.c
+++ b/drivers/media/dvb/dvb-core/dvb_net.c
@@ -1052,7 +1052,7 @@ static void wq_set_multicast_list (void *data)
 
 	dvb_net_feed_stop(dev);
 	priv->rx_mode = RX_MODE_UNI;
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 
 	if (dev->flags & IFF_PROMISC) {
 		dprintk("%s: promiscuous mode\n", dev->name);
@@ -1077,7 +1077,7 @@ static void wq_set_multicast_list (void *data)
 		}
 	}
 
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	dvb_net_feed_start(dev);
 }
 
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 54161aef3cac..9c5a8842ed0f 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2009,7 +2009,7 @@ bnx2_poll(struct net_device *dev, int *budget)
 	return 1;
 }
 
-/* Called with rtnl_lock from vlan functions and also dev->xmit_lock
+/* Called with rtnl_lock from vlan functions and also netif_tx_lock
  * from set_multicast.
  */
 static void
@@ -4252,7 +4252,7 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
 }
 #endif
 
-/* Called with dev->xmit_lock.
+/* Called with netif_tx_lock.
  * hard_start_xmit is pseudo-lockless - a lock is only required when
  * the tx queue is full. This way, we get the benefit of lockless
  * operations most of the time without the complexities to handle
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 55d236726d11..46326cdfb277 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4191,7 +4191,7 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params)
 	 */
 	bond_dev->features |= NETIF_F_VLAN_CHALLENGED;
 
-	/* don't acquire bond device's xmit_lock when 
+	/* don't acquire bond device's netif_tx_lock when
 	 * transmitting */
 	bond_dev->features |= NETIF_F_LLTX;
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index feb5b223cd60..5a8651b4b01d 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -533,9 +533,9 @@ typedef union _ring_type {
  * critical parts:
  * - rx is (pseudo-) lockless: it relies on the single-threading provided
  *	by the arch code for interrupts.
- * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission
+ * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
  *	needs dev->priv->lock :-(
- * - set_multicast_list: preparation lockless, relies on dev->xmit_lock.
+ * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
  */
 
 /* in dev: base, irq */
@@ -1213,7 +1213,7 @@ static void drain_ring(struct net_device *dev)
 
 /*
  * nv_start_xmit: dev->hard_start_xmit function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
@@ -1407,7 +1407,7 @@ static void nv_tx_done(struct net_device *dev)
 
 /*
  * nv_tx_timeout: dev->tx_timeout function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static void nv_tx_timeout(struct net_device *dev)
 {
@@ -1737,7 +1737,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu)
 		 * Changing the MTU is a rare event, it shouldn't matter.
 		 */
 		nv_disable_irq(dev);
-		spin_lock_bh(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		spin_lock(&np->lock);
 		/* stop engines */
 		nv_stop_rx(dev);
@@ -1768,7 +1768,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu)
 		nv_start_rx(dev);
 		nv_start_tx(dev);
 		spin_unlock(&np->lock);
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 		nv_enable_irq(dev);
 	}
 	return 0;
@@ -1803,7 +1803,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 	memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN);
 
 	if (netif_running(dev)) {
-		spin_lock_bh(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		spin_lock_irq(&np->lock);
 
 		/* stop rx engine */
@@ -1815,7 +1815,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 		/* restart rx engine */
 		nv_start_rx(dev);
 		spin_unlock_irq(&np->lock);
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 	} else {
 		nv_copy_mac_to_hw(dev);
 	}
@@ -1824,7 +1824,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr)
 
 /*
  * nv_set_multicast: dev->set_multicast function
- * Called with dev->xmit_lock held.
+ * Called with netif_tx_lock held.
  */
 static void nv_set_multicast(struct net_device *dev)
 {
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 102c1f0b90da..d12605f0ac7c 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -308,9 +308,9 @@ static int sp_set_mac_address(struct net_device *dev, void *addr)
 {
 	struct sockaddr_ax25 *sa = addr;
 
-	spin_lock_irq(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
-	spin_unlock_irq(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 
 	return 0;
 }
@@ -767,9 +767,9 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file,
 			break;
 		}
 
-		spin_lock_irq(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN);
-		spin_unlock_irq(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 
 		err = 0;
 		break;
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index d81a8e1eeb8d..3ebbbe56b6e9 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -357,9 +357,9 @@ static int ax_set_mac_address(struct net_device *dev, void *addr)
 {
 	struct sockaddr_ax25 *sa = addr;
 
-	spin_lock_irq(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
-	spin_unlock_irq(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 
 	return 0;
 }
@@ -886,9 +886,9 @@ static int mkiss_ioctl(struct tty_struct *tty, struct file *file,
 			break;
 		}
 
-		spin_lock_irq(&dev->xmit_lock);
+		netif_tx_lock_bh(dev);
 		memcpy(dev->dev_addr, addr, AX25_ADDR_LEN);
-		spin_unlock_irq(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 
 		err = 0;
 		break;
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 31fb2d75dc44..2e222ef91e22 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -76,13 +76,13 @@ static void ri_tasklet(unsigned long dev)
 	dp->st_task_enter++;
 	if ((skb = skb_peek(&dp->tq)) == NULL) {
 		dp->st_txq_refl_try++;
-		if (spin_trylock(&_dev->xmit_lock)) {
+		if (netif_tx_trylock(_dev)) {
 			dp->st_rxq_enter++;
 			while ((skb = skb_dequeue(&dp->rq)) != NULL) {
 				skb_queue_tail(&dp->tq, skb);
 				dp->st_rx2tx_tran++;
 			}
-			spin_unlock(&_dev->xmit_lock);
+			netif_tx_unlock(_dev);
 		} else {
 			/* reschedule */
 			dp->st_rxq_notenter++;
@@ -110,7 +110,7 @@ static void ri_tasklet(unsigned long dev)
 		}
 	}
 
-	if (spin_trylock(&_dev->xmit_lock)) {
+	if (netif_tx_trylock(_dev)) {
 		dp->st_rxq_check++;
 		if ((skb = skb_peek(&dp->rq)) == NULL) {
 			dp->tasklet_pending = 0;
@@ -118,10 +118,10 @@ static void ri_tasklet(unsigned long dev)
 				netif_wake_queue(_dev);
 		} else {
 			dp->st_rxq_rsch++;
-			spin_unlock(&_dev->xmit_lock);
+			netif_tx_unlock(_dev);
 			goto resched;
 		}
-		spin_unlock(&_dev->xmit_lock);
+		netif_tx_unlock(_dev);
 	} else {
 resched:
 		dp->tasklet_pending = 1;
diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c
index 97a49e0be76b..d70b9e8d6e60 100644
--- a/drivers/net/irda/vlsi_ir.c
+++ b/drivers/net/irda/vlsi_ir.c
@@ -959,7 +959,7 @@ static int vlsi_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 			    ||  (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec))
 			    	break;
 			udelay(100);
-			/* must not sleep here - we are called under xmit_lock! */
+			/* must not sleep here - called under netif_tx_lock! */
 		}
 	}
 
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 90627756d6fa..2e4ecedba057 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -318,12 +318,12 @@ performance critical codepaths:
 The rx process only runs in the interrupt handler. Access from outside
 the interrupt handler is only permitted after disable_irq().
 
-The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap
+The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap
 is set, then access is permitted under spin_lock_irq(&np->lock).
 
 Thus configuration functions that want to access everything must call
 	disable_irq(dev->irq);
-	spin_lock_bh(dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	spin_lock_irq(&np->lock);
 
 IV. Notes
diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c
index 136a70c4d5e4..56d86c7c0386 100644
--- a/drivers/net/tulip/winbond-840.c
+++ b/drivers/net/tulip/winbond-840.c
@@ -1605,11 +1605,11 @@ static void __devexit w840_remove1 (struct pci_dev *pdev)
  * - get_stats:
  * 	spin_lock_irq(np->lock), doesn't touch hw if not present
  * - hard_start_xmit:
- * 	netif_stop_queue + spin_unlock_wait(&dev->xmit_lock);
+ * 	synchronize_irq + netif_tx_disable;
  * - tx_timeout:
- * 	netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
+ * 	netif_device_detach + netif_tx_disable;
  * - set_multicast_list
- * 	netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
+ * 	netif_device_detach + netif_tx_disable;
  * - interrupt handler
  * 	doesn't touch hw if not present, synchronize_irq waits for
  * 	running instances of the interrupt handler.
@@ -1635,11 +1635,10 @@ static int w840_suspend (struct pci_dev *pdev, pm_message_t state)
 		netif_device_detach(dev);
 		update_csr6(dev, 0);
 		iowrite32(0, ioaddr + IntrEnable);
-		netif_stop_queue(dev);
 		spin_unlock_irq(&np->lock);
 
-		spin_unlock_wait(&dev->xmit_lock);
 		synchronize_irq(dev->irq);
+		netif_tx_disable(dev);
 	
 		np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff;
 
diff --git a/drivers/net/wireless/orinoco.c b/drivers/net/wireless/orinoco.c
index c2d0b09e0418..a5fcfcde63d1 100644
--- a/drivers/net/wireless/orinoco.c
+++ b/drivers/net/wireless/orinoco.c
@@ -1833,7 +1833,9 @@ static int __orinoco_program_rids(struct net_device *dev)
 	/* Set promiscuity / multicast*/
 	priv->promiscuous = 0;
 	priv->mc_count = 0;
-	__orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */
+
+	/* FIXME: what about netif_tx_lock */
+	__orinoco_set_multicast_list(dev);
 
 	return 0;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b5760c67af9c..067b9ccafd87 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -407,7 +407,7 @@ struct net_device
  * One part is mostly used on xmit path (device)
  */
 	/* hard_start_xmit synchronizer */
-	spinlock_t		xmit_lock ____cacheline_aligned_in_smp;
+	spinlock_t		_xmit_lock ____cacheline_aligned_in_smp;
 	/* cpu id of processor entered to hard_start_xmit or -1,
 	   if nobody entered there.
 	 */
@@ -893,11 +893,43 @@ static inline void __netif_rx_complete(struct net_device *dev)
 	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
 }
 
+static inline void netif_tx_lock(struct net_device *dev)
+{
+	spin_lock(&dev->_xmit_lock);
+	dev->xmit_lock_owner = smp_processor_id();
+}
+
+static inline void netif_tx_lock_bh(struct net_device *dev)
+{
+	spin_lock_bh(&dev->_xmit_lock);
+	dev->xmit_lock_owner = smp_processor_id();
+}
+
+static inline int netif_tx_trylock(struct net_device *dev)
+{
+	int err = spin_trylock(&dev->_xmit_lock);
+	if (!err)
+		dev->xmit_lock_owner = smp_processor_id();
+	return err;
+}
+
+static inline void netif_tx_unlock(struct net_device *dev)
+{
+	dev->xmit_lock_owner = -1;
+	spin_unlock(&dev->_xmit_lock);
+}
+
+static inline void netif_tx_unlock_bh(struct net_device *dev)
+{
+	dev->xmit_lock_owner = -1;
+	spin_unlock_bh(&dev->_xmit_lock);
+}
+
 static inline void netif_tx_disable(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	netif_stop_queue(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 /* These functions live elsewhere (drivers/net/net_init.c, but related) */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 72d852982664..f92f9c94d2c7 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
 		printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
 		return;
 	}
-	spin_lock_bh(&entry->neigh->dev->xmit_lock);	/* block clip_start_xmit() */
+	netif_tx_lock_bh(entry->neigh->dev);	/* block clip_start_xmit() */
 	entry->neigh->used = jiffies;
 	for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
 		if (*walk == clip_vcc) {
@@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
 	printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
 	       "0x%p)\n", entry, clip_vcc);
       out:
-	spin_unlock_bh(&entry->neigh->dev->xmit_lock);
+	netif_tx_unlock_bh(entry->neigh->dev);
 }
 
 /* The neighbour entry n->lock is held. */
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bfa78c66c25..1b09f1cae46e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1282,15 +1282,13 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
 
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		spin_lock(&dev->xmit_lock);		\
-		dev->xmit_lock_owner = cpu;		\
+		netif_tx_lock(dev);			\
 	}						\
 }
 
 #define HARD_TX_UNLOCK(dev) {				\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
-		dev->xmit_lock_owner = -1;		\
-		spin_unlock(&dev->xmit_lock);		\
+		netif_tx_unlock(dev);			\
 	}						\
 }
 
@@ -1389,8 +1387,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	/* The device has no queue. Common case for software devices:
 	   loopback, all the sorts of tunnels...
 
-	   Really, it is unlikely that xmit_lock protection is necessary here.
-	   (f.e. loopback and IP tunnels are clean ignoring statistics
+	   Really, it is unlikely that netif_tx_lock protection is necessary
+	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
 	   counters.)
 	   However, it is possible, that they rely on protection
 	   made by us here.
@@ -2805,7 +2803,7 @@ int register_netdevice(struct net_device *dev)
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 
 	spin_lock_init(&dev->queue_lock);
-	spin_lock_init(&dev->xmit_lock);
+	spin_lock_init(&dev->_xmit_lock);
 	dev->xmit_lock_owner = -1;
 #ifdef CONFIG_NET_CLS_ACT
 	spin_lock_init(&dev->ingress_lock);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 05d60850840e..c57d887da2ef 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -62,7 +62,7 @@
  *	Device mc lists are changed by bh at least if IPv6 is enabled,
  *	so that it must be bh protected.
  *
- *	We block accesses to device mc filters with dev->xmit_lock.
+ *	We block accesses to device mc filters with netif_tx_lock.
  */
 
 /*
@@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev)
 
 void dev_mc_upload(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	__dev_mc_upload(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 /*
@@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 	int err = 0;
 	struct dev_mc_list *dmi, **dmip;
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 
 	for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
 		/*
@@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 			 */
 			__dev_mc_upload(dev);
 			
-			spin_unlock_bh(&dev->xmit_lock);
+			netif_tx_unlock_bh(dev);
 			return 0;
 		}
 	}
 	err = -ENOENT;
 done:
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return err;
 }
 
@@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
 		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
 		    dmi->dmi_addrlen == alen) {
@@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	}
 
 	if ((dmi = dmi1) == NULL) {
-		spin_unlock_bh(&dev->xmit_lock);
+		netif_tx_unlock_bh(dev);
 		return -ENOMEM;
 	}
 	memcpy(dmi->dmi_addr, addr, alen);
@@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	__dev_mc_upload(dev);
 	
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return 0;
 
 done:
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	kfree(dmi1);
 	return err;
 }
@@ -204,7 +204,7 @@ done:
 
 void dev_mc_discard(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	
 	while (dev->mc_list != NULL) {
 		struct dev_mc_list *tmp = dev->mc_list;
@@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev)
 	}
 	dev->mc_count = 0;
 
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
 	struct dev_mc_list *m;
 	struct net_device *dev = v;
 
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	for (m = dev->mc_list; m; m = m->next) {
 		int i;
 
@@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
 
 		seq_putc(seq, '\n');
 	}
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 	return 0;
 }
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e8e05cebd95a..9cb781830380 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 
 	do {
 		npinfo->tries--;
-		spin_lock(&np->dev->xmit_lock);
-		np->dev->xmit_lock_owner = smp_processor_id();
+		netif_tx_lock(np->dev);
 
 		/*
 		 * network drivers do not expect to be called if the queue is
 		 * stopped.
 		 */
 		if (netif_queue_stopped(np->dev)) {
-			np->dev->xmit_lock_owner = -1;
-			spin_unlock(&np->dev->xmit_lock);
+			netif_tx_unlock(np->dev);
 			netpoll_poll(np);
 			udelay(50);
 			continue;
 		}
 
 		status = np->dev->hard_start_xmit(skb, np->dev);
-		np->dev->xmit_lock_owner = -1;
-		spin_unlock(&np->dev->xmit_lock);
+		netif_tx_unlock(np->dev);
 
 		/* success */
 		if(!status) {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c23e9c06ee23..67ed14ddabd2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		}
 	}
 
-	spin_lock_bh(&odev->xmit_lock);
+	netif_tx_lock_bh(odev);
 	if (!netif_queue_stopped(odev)) {
 
 		atomic_inc(&(pkt_dev->skb->users));
@@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->next_tx_ns = 0;
 	}
 
-	spin_unlock_bh(&odev->xmit_lock);
+	netif_tx_unlock_bh(odev);
 
 	/* If pkt_dev->count is zero, then run forever */
 	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 138ea92ed268..b1e4c5e20ac7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev)
    dev->queue_lock serializes queue accesses for this device
    AND dev->qdisc pointer itself.
 
-   dev->xmit_lock serializes accesses to device driver.
+   netif_tx_lock serializes accesses to device driver.
 
-   dev->queue_lock and dev->xmit_lock are mutually exclusive,
+   dev->queue_lock and netif_tx_lock are mutually exclusive,
    if one is grabbed, another must be free.
  */
 
@@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev)
 		 * will be requeued.
 		 */
 		if (!nolock) {
-			if (!spin_trylock(&dev->xmit_lock)) {
+			if (!netif_tx_trylock(dev)) {
 			collision:
 				/* So, someone grabbed the driver. */
 				
@@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev)
 				__get_cpu_var(netdev_rx_stat).cpu_collision++;
 				goto requeue;
 			}
-			/* Remember that the driver is grabbed by us. */
-			dev->xmit_lock_owner = smp_processor_id();
 		}
 		
 		{
@@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev)
 				ret = dev->hard_start_xmit(skb, dev);
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
-						dev->xmit_lock_owner = -1;
-						spin_unlock(&dev->xmit_lock);
+						netif_tx_unlock(dev);
 					}
 					spin_lock(&dev->queue_lock);
 					return -1;
@@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev)
 			/* NETDEV_TX_BUSY - we need to requeue */
 			/* Release the driver */
 			if (!nolock) { 
-				dev->xmit_lock_owner = -1;
-				spin_unlock(&dev->xmit_lock);
+				netif_tx_unlock(dev);
 			} 
 			spin_lock(&dev->queue_lock);
 			q = dev->qdisc;
@@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg)
 {
 	struct net_device *dev = (struct net_device *)arg;
 
-	spin_lock(&dev->xmit_lock);
+	netif_tx_lock(dev);
 	if (dev->qdisc != &noop_qdisc) {
 		if (netif_device_present(dev) &&
 		    netif_running(dev) &&
@@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg)
 				dev_hold(dev);
 		}
 	}
-	spin_unlock(&dev->xmit_lock);
+	netif_tx_unlock(dev);
 
 	dev_put(dev);
 }
@@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev)
 
 static void dev_watchdog_up(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	__netdev_watchdog_up(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 static void dev_watchdog_down(struct net_device *dev)
 {
-	spin_lock_bh(&dev->xmit_lock);
+	netif_tx_lock_bh(dev);
 	if (del_timer(&dev->watchdog_timer))
 		dev_put(dev);
-	spin_unlock_bh(&dev->xmit_lock);
+	netif_tx_unlock_bh(dev);
 }
 
 void netif_carrier_on(struct net_device *dev)
@@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev)
 	while (test_bit(__LINK_STATE_SCHED, &dev->state))
 		yield();
 
-	spin_unlock_wait(&dev->xmit_lock);
+	spin_unlock_wait(&dev->_xmit_lock);
 }
 
 void dev_init_scheduler(struct net_device *dev)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 79b8ef34c6e4..4c16ad57a3e4 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -302,20 +302,17 @@ restart:
 
 		switch (teql_resolve(skb, skb_res, slave)) {
 		case 0:
-			if (spin_trylock(&slave->xmit_lock)) {
-				slave->xmit_lock_owner = smp_processor_id();
+			if (netif_tx_trylock(slave)) {
 				if (!netif_queue_stopped(slave) &&
 				    slave->hard_start_xmit(skb, slave) == 0) {
-					slave->xmit_lock_owner = -1;
-					spin_unlock(&slave->xmit_lock);
+					netif_tx_unlock(slave);
 					master->slaves = NEXT_SLAVE(q);
 					netif_wake_queue(dev);
 					master->stats.tx_packets++;
 					master->stats.tx_bytes += len;
 					return 0;
 				}
-				slave->xmit_lock_owner = -1;
-				spin_unlock(&slave->xmit_lock);
+				netif_tx_unlock(slave);
 			}
 			if (netif_queue_stopped(dev))
 				busy = 1;
-- 
cgit v1.2.3


From 364c6badde0dd62a0a38e5ed67f85d87d6665780 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:10:40 -0700
Subject: [NET]: Clean up skb_linearize

The linearisation operation doesn't need to be super-optimised.  So we can
replace __skb_linearize with __pskb_pull_tail which does the same thing but
is more general.

Also, most users of skb_linearize end up testing whether the skb is linear
or not so it helps to make skb_linearize do just that.

Some callers of skb_linearize also use it to copy cloned data, so it's
useful to have a new function skb_linearize_cow to copy the data if it's
either non-linear or cloned.

Last but not least, I've removed the gfp argument since nobody uses it
anymore.  If it's ever needed we can easily add it back.

Misc bugs fixed by this patch:

* via-velocity error handling (also, no SG => no frags)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/aoe/aoenet.c |  3 +--
 drivers/net/mv643xx_eth.c  |  2 +-
 drivers/net/via-velocity.c | 10 +++++---
 include/linux/skbuff.h     | 24 +++++++++++++++---
 net/core/dev.c             | 63 ++--------------------------------------------
 net/decnet/dn_nsp_in.c     |  3 +--
 net/decnet/dn_route.c      |  3 +--
 net/ipv4/ipcomp.c          | 11 +++-----
 net/ipv6/ipcomp6.c         | 11 +++-----
 9 files changed, 39 insertions(+), 91 deletions(-)

(limited to 'include')

diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index fdff774b8ab9..c1434ed11880 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -116,8 +116,7 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return 0;
-	if (skb_is_nonlinear(skb))
-	if (skb_linearize(skb, GFP_ATOMIC) < 0)
+	if (skb_linearize(skb))
 		goto exit;
 	if (!is_aoe_netif(ifp))
 		goto exit;
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 411f4d809c47..625ff61c9988 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -1200,7 +1200,7 @@ static int mv643xx_eth_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (has_tiny_unaligned_frags(skb)) {
-		if ((skb_linearize(skb, GFP_ATOMIC) != 0)) {
+		if (__skb_linearize(skb)) {
 			stats->tx_dropped++;
 			printk(KERN_DEBUG "%s: failed to linearize tiny "
 					"unaligned fragment\n", dev->name);
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index ed1f837c8fda..2eb6b5f9ba0d 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -1899,6 +1899,13 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	int pktlen = skb->len;
 
+#ifdef VELOCITY_ZERO_COPY_SUPPORT
+	if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) {
+		kfree_skb(skb);
+		return 0;
+	}
+#endif
+
 	spin_lock_irqsave(&vptr->lock, flags);
 
 	index = vptr->td_curr[qnum];
@@ -1914,8 +1921,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	if (pktlen < ETH_ZLEN) {
 		/* Cannot occur until ZC support */
-		if(skb_linearize(skb, GFP_ATOMIC))
-			return 0; 
 		pktlen = ETH_ZLEN;
 		memcpy(tdinfo->buf, skb->data, skb->len);
 		memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len);
@@ -1933,7 +1938,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 		int nfrags = skb_shinfo(skb)->nr_frags;
 		tdinfo->skb = skb;
 		if (nfrags > 6) {
-			skb_linearize(skb, GFP_ATOMIC);
 			memcpy(tdinfo->buf, skb->data, skb->len);
 			tdinfo->skb_dma[0] = tdinfo->buf_dma;
 			td_ptr->tdesc0.pktsize = 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe2c58e5306f..830f58fa03a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1169,18 +1169,34 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i,
 	return 0;
 }
 
+static inline int __skb_linearize(struct sk_buff *skb)
+{
+	return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
+}
+
 /**
  *	skb_linearize - convert paged skb to linear one
  *	@skb: buffer to linarize
- *	@gfp: allocation mode
  *
  *	If there is no free memory -ENOMEM is returned, otherwise zero
  *	is returned and the old skb data released.
  */
-extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
-static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
+static inline int skb_linearize(struct sk_buff *skb)
+{
+	return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
+}
+
+/**
+ *	skb_linearize_cow - make sure skb is linear and writable
+ *	@skb: buffer to process
+ *
+ *	If there is no free memory -ENOMEM is returned, otherwise zero
+ *	is returned and the old skb data released.
+ */
+static inline int skb_linearize_cow(struct sk_buff *skb)
 {
-	return __skb_linearize(skb, gfp);
+	return skb_is_nonlinear(skb) || skb_cloned(skb) ?
+	       __skb_linearize(skb) : 0;
 }
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b09f1cae46e..91361bc2b682 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1222,64 +1222,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)	(0)
 #endif
 
-/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
-{
-	unsigned int size;
-	u8 *data;
-	long offset;
-	struct skb_shared_info *ninfo;
-	int headerlen = skb->data - skb->head;
-	int expand = (skb->tail + skb->data_len) - skb->end;
-
-	if (skb_shared(skb))
-		BUG();
-
-	if (expand <= 0)
-		expand = 0;
-
-	size = skb->end - skb->head + expand;
-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
-	if (!data)
-		return -ENOMEM;
-
-	/* Copy entire thing */
-	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
-		BUG();
-
-	/* Set up shinfo */
-	ninfo = (struct skb_shared_info*)(data + size);
-	atomic_set(&ninfo->dataref, 1);
-	ninfo->tso_size = skb_shinfo(skb)->tso_size;
-	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
-	ninfo->nr_frags = 0;
-	ninfo->frag_list = NULL;
-
-	/* Offset between the two in bytes */
-	offset = data - skb->head;
-
-	/* Free old data. */
-	skb_release_data(skb);
-
-	skb->head = data;
-	skb->end  = data + size;
-
-	/* Set up new pointers */
-	skb->h.raw   += offset;
-	skb->nh.raw  += offset;
-	skb->mac.raw += offset;
-	skb->tail    += offset;
-	skb->data    += offset;
-
-	/* We are no longer a clone, even if we were. */
-	skb->cloned    = 0;
-
-	skb->tail     += skb->data_len;
-	skb->data_len  = 0;
-	return 0;
-}
-
 #define HARD_TX_LOCK(dev, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		netif_tx_lock(dev);			\
@@ -1326,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 	if (skb_shinfo(skb)->frag_list &&
 	    !(dev->features & NETIF_F_FRAGLIST) &&
-	    __skb_linearize(skb, GFP_ATOMIC))
+	    __skb_linearize(skb))
 		goto out_kfree_skb;
 
 	/* Fragmented skb is linearized if device does not support SG,
@@ -1335,7 +1277,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	 */
 	if (skb_shinfo(skb)->nr_frags &&
 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
-	    __skb_linearize(skb, GFP_ATOMIC))
+	    __skb_linearize(skb))
 		goto out_kfree_skb;
 
 	/* If packet is not checksummed and device does not support
@@ -3473,7 +3415,6 @@ subsys_initcall(net_dev_init);
 EXPORT_SYMBOL(__dev_get_by_index);
 EXPORT_SYMBOL(__dev_get_by_name);
 EXPORT_SYMBOL(__dev_remove_pack);
-EXPORT_SYMBOL(__skb_linearize);
 EXPORT_SYMBOL(dev_valid_name);
 EXPORT_SYMBOL(dev_add_pack);
 EXPORT_SYMBOL(dev_alloc_name);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 547523b41c81..a2ba9db1c376 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -801,8 +801,7 @@ got_it:
 		 * We linearize everything except data segments here.
 		 */
 		if (cb->nsp_flags & ~0x60) {
-			if (unlikely(skb_is_nonlinear(skb)) &&
-			    skb_linearize(skb, GFP_ATOMIC) != 0)
+			if (unlikely(skb_linearize(skb)))
 				goto free_out;
 		}
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e172cf98d7fc..5abf7057af00 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
 			padlen);
 
         if (flags & DN_RT_PKT_CNTL) {
-		if (unlikely(skb_is_nonlinear(skb)) &&
-		    skb_linearize(skb, GFP_ATOMIC) != 0)
+		if (unlikely(skb_linearize(skb)))
 			goto dump_it;
 
                 switch(flags & DN_RT_CNTL_MSK) {
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 8e243589045f..3ed8b57a1002 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -80,15 +80,12 @@ out:
 
 static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err = 0;
+	int err = -ENOMEM;
 	struct iphdr *iph;
 	struct ip_comp_hdr *ipch;
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
-	    	err = -ENOMEM;
+	if (skb_linearize_cow(skb))
 	    	goto out;
-	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -158,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 	}
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-	    skb_linearize(skb, GFP_ATOMIC) != 0) {
+	if (skb_linearize_cow(skb))
 		goto out_ok;
-	}
 	
 	err = ipcomp_compress(x, skb);
 	iph = skb->nh.iph;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index cec3be544b69..f28cd37feed3 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list);
 
 static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err = 0;
+	int err = -ENOMEM;
 	struct ipv6hdr *iph;
 	struct ipv6_comp_hdr *ipch;
 	int plen, dlen;
@@ -74,11 +74,8 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct crypto_tfm *tfm;
 	int cpu;
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-		skb_linearize(skb, GFP_ATOMIC) != 0) {
-		err = -ENOMEM;
+	if (skb_linearize_cow(skb))
 		goto out;
-	}
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -142,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto out_ok;
 	}
 
-	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
-		skb_linearize(skb, GFP_ATOMIC) != 0) {
+	if (skb_linearize_cow(skb))
 		goto out_ok;
-	}
 
 	/* compression */
 	plen = skb->len - hdr_len;
-- 
cgit v1.2.3


From b38dfee3d616ffadb58d4215e3ff9d1d7921031e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:13:01 -0700
Subject: [NET]: skb_trim audit

I found a few more spots where pskb_trim_rcsum could be used but were not.
This patch changes them to use it.

Also, sk_filter can get paged skb data.  Therefore we must use pskb_trim
instead of skb_trim.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h                      |  5 +----
 net/bridge/br_netfilter.c               | 14 +++-----------
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++-------
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 75b0e97ed93d..96565ff0de6a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -873,10 +873,7 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
 		if (filter) {
 			unsigned int pkt_len = sk_run_filter(skb, filter->insns,
 							     filter->len);
-			if (!pkt_len)
-				err = -EPERM;
-			else
-				skb_trim(skb, pkt_len);
+			err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 		}
 
 		if (needlock)
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3da9264449f7..3e41f9d6d51c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
 		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
 			goto inhdr_error;
-		if (pkt_len + sizeof(struct ipv6hdr) < skb->len) {
-			if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr)))
-				goto inhdr_error;
-			if (skb->ip_summed == CHECKSUM_HW)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
+		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+			goto inhdr_error;
 	}
 	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
 		goto inhdr_error;
@@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 	if (skb->len < len || len < 4 * iph->ihl)
 		goto inhdr_error;
 
-	if (skb->len > len) {
-		__pskb_trim(skb, len);
-		if (skb->ip_summed == CHECKSUM_HW)
-			skb->ip_summed = CHECKSUM_NONE;
-	}
+	pskb_trim_rcsum(skb, len);
 
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3e319035f82d..c32a029e43f0 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		DEBUGP("queue: message is too short.\n");
 		goto err;
 	}
-	if (end-offset < skb->len) {
-		if (pskb_trim(skb, end - offset)) {
-			DEBUGP("Can't trim\n");
-			goto err;
-		}
-		if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-			skb->ip_summed = CHECKSUM_NONE;
+	if (pskb_trim_rcsum(skb, end - offset)) {
+		DEBUGP("Can't trim\n");
+		goto err;
 	}
 
 	/* Find out which fragments are in front and at the back of us
-- 
cgit v1.2.3


From 3cc0e873986fe594d0e96d07259b11f755325cb2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 9 Jun 2006 16:13:38 -0700
Subject: [NET]: Warn in __skb_trim if skb is paged

It's better to warn and fail rather than rarely triggering BUG on paths
that incorrectly call skb_trim/__skb_trim on a non-linear skb.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 24 ++++++++++++------------
 net/core/skbuff.c      |  7 ++-----
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 830f58fa03a2..93e4db221585 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -975,15 +975,16 @@ static inline void skb_reserve(struct sk_buff *skb, int len)
 #define NET_SKB_PAD	16
 #endif
 
-extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc);
+extern int ___pskb_trim(struct sk_buff *skb, unsigned int len);
 
 static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
 {
-	if (!skb->data_len) {
-		skb->len  = len;
-		skb->tail = skb->data + len;
-	} else
-		___pskb_trim(skb, len, 0);
+	if (unlikely(skb->data_len)) {
+		WARN_ON(1);
+		return;
+	}
+	skb->len  = len;
+	skb->tail = skb->data + len;
 }
 
 /**
@@ -993,6 +994,7 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
  *
  *	Cut the length of a buffer down by removing data from the tail. If
  *	the buffer is already under the length specified it is not modified.
+ *	The skb must be linear.
  */
 static inline void skb_trim(struct sk_buff *skb, unsigned int len)
 {
@@ -1003,12 +1005,10 @@ static inline void skb_trim(struct sk_buff *skb, unsigned int len)
 
 static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
 {
-	if (!skb->data_len) {
-		skb->len  = len;
-		skb->tail = skb->data+len;
-		return 0;
-	}
-	return ___pskb_trim(skb, len, 1);
+	if (skb->data_len)
+		return ___pskb_trim(skb, len);
+	__skb_trim(skb, len);
+	return 0;
 }
 
 static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 96cdcbe24ba2..bb7210f4005e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -801,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
 	return nskb;
 }	
  
-/* Trims skb to length len. It can change skb pointers, if "realloc" is 1.
- * If realloc==0 and trimming is impossible without change of data,
- * it is BUG().
+/* Trims skb to length len. It can change skb pointers.
  */
 
-int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
 	int offset = skb_headlen(skb);
 	int nfrags = skb_shinfo(skb)->nr_frags;
@@ -816,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
 		int end = offset + skb_shinfo(skb)->frags[i].size;
 		if (end > len) {
 			if (skb_cloned(skb)) {
-				BUG_ON(!realloc);
 				if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 					return -ENOMEM;
 			}
-- 
cgit v1.2.3


From bdeb04c6d9a957ae2a51c3033414467b82b2a736 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 11 Jun 2006 21:20:38 -0700
Subject: [NET]: net.ipv4.ip_autoconfig sysctl removal

The sysctl net.ipv4.ip_autoconfig is a legacy value that is not used.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h           | 1 -
 net/ipv4/sysctl_net_ipv4.c | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 3d2e5ca62a5a..ead233c9540d 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -147,7 +147,6 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 struct ipv4_config
 {
 	int	log_martians;
-	int	autoconfig;
 	int	no_pmtu_disc;
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6a6aa537b7aa..e7fb665873c6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -181,14 +181,6 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &ipv4_doint_and_flush,
 		.strategy	= &ipv4_doint_and_flush_strategy,
 	},
-        {
-		.ctl_name	= NET_IPV4_AUTOCONFIG,
-		.procname	= "ip_autoconfig",
-		.data		= &ipv4_config.autoconfig,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
         {
 		.ctl_name	= NET_IPV4_NO_PMTU_DISC,
 		.procname	= "ip_no_pmtu_disc",
-- 
cgit v1.2.3


From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 13 Jun 2006 22:33:04 -0700
Subject: [TCP]: Add tcp_slow_start_after_idle sysctl.

A lot of people have asked for a way to disable tcp_cwnd_restart(),
and it seems reasonable to add a sysctl to do that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/sysctl.h                 | 1 +
 include/net/tcp.h                      | 1 +
 net/ipv4/sysctl_net_ipv4.c             | 8 ++++++++
 net/ipv4/tcp_output.c                  | 6 +++++-
 5 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f12007b80a46..d46338af6002 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN
 	not receive a window scaling option from them.
 	Default: 0
 
+tcp_slow_start_after_idle - BOOLEAN
+	If set, provide RFC2861 behavior and time out the congestion
+	window after an idle period.  An idle period is defined at
+	the current RTO.  If unset, the congestion window will not
+	be timed out after an idle period.
+	Default: 1
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 98338ed2c0b6..cee944dbdcd4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -405,6 +405,7 @@ enum
 	NET_TCP_BASE_MSS=114,
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
+	NET_TCP_SLOW_START_AFTER_IDLE=117,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index de88c5472bfc..bfc71f954bbe 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -227,6 +227,7 @@ extern int sysctl_tcp_abc;
 extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
+extern int sysctl_tcp_slow_start_after_idle;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7fb665873c6..ce4cd5f35511 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -690,6 +690,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 #endif
+	{
+		.ctl_name	= NET_TCP_SLOW_START_AFTER_IDLE,
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f33c9dddaa12..07bb5a2b375e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3;
 int sysctl_tcp_mtu_probing = 0;
 int sysctl_tcp_base_mss = 512;
 
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle = 1;
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+	if (sysctl_tcp_slow_start_after_idle &&
+	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
 		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
-- 
cgit v1.2.3


From 8648b3053bff39a7ee4c711d74268079c928a657 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 17 Jun 2006 22:06:05 -0700
Subject: [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM

The current stack treats NETIF_F_HW_CSUM and NETIF_F_NO_CSUM
identically so we test for them in quite a few places.  For the sake
of brevity, I'm adding the macro NETIF_F_GEN_CSUM for these two.  We
also test the disjunct of NETIF_F_IP_CSUM and the other two in various
places, for that purpose I've added NETIF_F_ALL_CSUM.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |  7 ++-----
 include/linux/netdevice.h       |  3 +++
 net/bridge/br_if.c              |  3 +--
 net/core/dev.c                  |  6 ++----
 net/core/ethtool.c              |  6 ++----
 net/ipv4/ip_output.c            |  2 +-
 net/ipv4/tcp.c                  | 10 ++--------
 7 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 46326cdfb277..8171cae06688 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1199,8 +1199,7 @@ int bond_sethwaddr(struct net_device *bond_dev, struct net_device *slave_dev)
 }
 
 #define BOND_INTERSECT_FEATURES \
-	(NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\
-	NETIF_F_TSO|NETIF_F_UFO)
+	(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO)
 
 /* 
  * Compute the common dev->feature set available to all slaves.  Some
@@ -1218,9 +1217,7 @@ static int bond_compute_features(struct bonding *bond)
 		features &= (slave->dev->features & BOND_INTERSECT_FEATURES);
 
 	if ((features & NETIF_F_SG) && 
-	    !(features & (NETIF_F_IP_CSUM |
-			  NETIF_F_NO_CSUM |
-			  NETIF_F_HW_CSUM)))
+	    !(features & NETIF_F_ALL_CSUM))
 		features &= ~NETIF_F_SG;
 
 	/* 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 067b9ccafd87..e432b743dda2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -312,6 +312,9 @@ struct net_device
 #define NETIF_F_LLTX		4096	/* LockLess TX */
 #define NETIF_F_UFO             8192    /* Can offload UDP Large Send*/
 
+#define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
+#define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
+
 	struct net_device	*next_sched;
 
 	/* Interface index. Unique device identifier	*/
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f5d47bf4f967..90c95f59dc82 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -376,8 +376,7 @@ void br_features_recompute(struct net_bridge *br)
 	checksum = br->feature_mask & NETIF_F_IP_CSUM;
 
 	list_for_each_entry(p, &br->port_list, list) {
-		if (!(p->dev->features 
-		      & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
+		if (!(p->dev->features & NETIF_F_ALL_CSUM))
 			checksum = 0;
 		features &= p->dev->features;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index 91361bc2b682..ab39fe17cb58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1284,7 +1284,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	 * checksumming for this protocol, complete checksumming here.
 	 */
 	if (skb->ip_summed == CHECKSUM_HW &&
-	    (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
+	    (!(dev->features & NETIF_F_GEN_CSUM) &&
 	     (!(dev->features & NETIF_F_IP_CSUM) ||
 	      skb->protocol != htons(ETH_P_IP))))
 	      	if (skb_checksum_help(skb, 0))
@@ -2789,9 +2789,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* Fix illegal SG+CSUM combinations. */
 	if ((dev->features & NETIF_F_SG) &&
-	    !(dev->features & (NETIF_F_IP_CSUM |
-			       NETIF_F_NO_CSUM |
-			       NETIF_F_HW_CSUM))) {
+	    !(dev->features & NETIF_F_ALL_CSUM)) {
 		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
 		       dev->name);
 		dev->features &= ~NETIF_F_SG;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e6f76106a99b..3f269e4e9438 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
 
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
-	return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
+	return (dev->features & NETIF_F_ALL_CSUM) != 0;
 }
 
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
 		return -EFAULT;
 
 	if (edata.data && 
-	    !(dev->features & (NETIF_F_IP_CSUM |
-			       NETIF_F_NO_CSUM |
-			       NETIF_F_HW_CSUM)))
+	    !(dev->features & NETIF_F_ALL_CSUM))
 		return -EINVAL;
 
 	return __ethtool_set_sg(dev, edata.data);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d4bb3fae4e49..8538aac3d148 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -840,7 +840,7 @@ int ip_append_data(struct sock *sk,
 	 */
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
-	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 	    !exthdrlen)
 		csummode = CHECKSUM_HW;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ff6ccda9ff46..74998f250071 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 	ssize_t res;
 	struct sock *sk = sock->sk;
 
-#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
-	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 		return sock_no_sendpage(sock, page, offset, size, flags);
 
-#undef TCP_ZC_CSUM_FLAGS
-
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
@@ -726,9 +722,7 @@ new_segment:
 				/*
 				 * Check whether we can use HW checksum.
 				 */
-				if (sk->sk_route_caps &
-				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
-				     NETIF_F_HW_CSUM))
+				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
-- 
cgit v1.2.3


From c7ce1ae21223fe1f905feba272bc14b87994a57d Mon Sep 17 00:00:00 2001
From: Tushar Gohad <tgohad@mvista.com>
Date: Sat, 17 Jun 2006 22:54:03 -0700
Subject: [PFKEYV2]: Fix inconsistent typing in struct sadb_x_kmprivate.

Signed-off-by: Tushar Gohad <tgohad@mvista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pfkeyv2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index bac0fb389cf1..d5dd471da225 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -159,7 +159,7 @@ struct sadb_spirange {
 struct sadb_x_kmprivate {
 	uint16_t	sadb_x_kmprivate_len;
 	uint16_t	sadb_x_kmprivate_exttype;
-	u_int32_t	sadb_x_kmprivate_reserved;
+	uint32_t	sadb_x_kmprivate_reserved;
 } __attribute__((packed));
 /* sizeof(struct sadb_x_kmprivate) == 8 */
 
-- 
cgit v1.2.3


From 5636bef7324f49e36f05ec8a5f6284e11b1bcca4 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sat, 17 Jun 2006 22:55:35 -0700
Subject: [SCTP]: Reject sctp packets with broadcast addresses.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 3 ++-
 net/sctp/input.c           | 3 ++-
 net/sctp/ipv6.c            | 6 ++++--
 net/sctp/protocol.c        | 8 +++++++-
 net/sctp/socket.c          | 2 +-
 5 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 7f4fea173fb1..5f69158c1006 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -555,7 +555,8 @@ struct sctp_af {
 	int		(*to_addr_param) (const union sctp_addr *,
 					  union sctp_addr_param *); 
 	int		(*addr_valid)	(union sctp_addr *,
-					 struct sctp_sock *);
+					 struct sctp_sock *,
+					 const struct sk_buff *);
 	sctp_scope_t	(*scope) (union sctp_addr *);
 	void		(*inaddr_any)	(union sctp_addr *, unsigned short);
 	int		(*is_any)	(const union sctp_addr *);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1662f9cc869e..70d6606e2812 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -170,7 +170,8 @@ int sctp_rcv(struct sk_buff *skb)
 	 * IP broadcast addresses cannot be used in an SCTP transport
 	 * address."
 	 */
-	if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL))
+	if (!af->addr_valid(&src, NULL, skb) ||
+	    !af->addr_valid(&dest, NULL, skb))
 		goto discard_it;
 
 	asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c20d282fac06..8ef08070c8b6 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
  * Return 0 - If the address is a non-unicast or an illegal address.
  * Return 1 - If the address is a unicast.
  */
-static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v6_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
 {
 	int ret = ipv6_addr_type(&addr->v6.sin6_addr);
 
@@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
 		if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
 			return 0;
 		sctp_v6_map_v4(addr);
-		return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp);
+		return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
 	}
 
 	/* Is this a non-unicast address */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2088aa992b7a..816c033d7886 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr)
  * Return 0 - If the address is a non-unicast or an illegal address.
  * Return 1 - If the address is a unicast.
  */
-static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp)
+static int sctp_v4_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
 {
 	/* Is this a non-unicast address or a unusable SCTP address? */
 	if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr))
 		return 0;
 
+ 	/* Is this a broadcast address? */
+ 	if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST)
+ 		return 0;
+
 	return 1;
 }
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b41dcbb89685..b811691c35bf 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
 		return -EINVAL;
 
 	/* Is this a valid SCTP address?  */
-	if (!af->addr_valid(addr, sctp_sk(sk)))
+	if (!af->addr_valid(addr, sctp_sk(sk), NULL))
 		return -EINVAL;
 
 	if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
-- 
cgit v1.2.3


From 4c9f5d5305a23851e67471b147e0d459a7166717 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Sat, 17 Jun 2006 22:56:08 -0700
Subject: [SCTP] Reset rtt_in_progress for the chunk when processing its sack.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 2 +-
 net/sctp/outqueue.c     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index aa6033ca7cd8..b2b40f951ae6 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -255,7 +255,7 @@ extern int sctp_debug_flag;
 #define SCTP_DEBUG_PRINTK_IPADDR(whatever...)
 #define SCTP_ENABLE_DEBUG
 #define SCTP_DISABLE_DEBUG
-#define SCTP_ASSERT(expr, str, func)
+#define SCTP_ASSERT(expr, str, func) BUG_ON(!(expr))
 
 #endif /* SCTP_DEBUG */
 
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f148f9576dd2..e5faa351aaad 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 			   	if (!tchunk->tsn_gap_acked &&
 				    !tchunk->resent &&
 				    tchunk->rtt_in_progress) {
+					tchunk->rtt_in_progress = 0;
 					rtt = jiffies - tchunk->sent_at;
 					sctp_transport_update_rto(transport,
 								  rtt);
-- 
cgit v1.2.3