diff options
author | Jeff Garzik <jgarzik@pobox.com> | 2006-01-17 16:29:06 +0100 |
---|---|---|
committer | Jeff Garzik <jgarzik@pobox.com> | 2006-01-17 16:29:06 +0100 |
commit | ea9b395fe20ac74be788f415af2622ac8f0c35c7 (patch) | |
tree | d1653e1a4cbe360aa7132ea4e29ab92a02038224 /net | |
parent | Merge branch 'upstream' (diff) | |
parent | [PATCH] libata: add a function to decide if we need iordy (diff) | |
download | linux-ea9b395fe20ac74be788f415af2622ac8f0c35c7.tar.xz linux-ea9b395fe20ac74be788f415af2622ac8f0c35c7.zip |
Merge branch 'upstream'
Diffstat (limited to 'net')
429 files changed, 30414 insertions, 7123 deletions
diff --git a/net/802/Makefile b/net/802/Makefile index 01861929591a..977704a54f68 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -2,8 +2,6 @@ # Makefile for the Linux 802.x protocol layers. # -obj-y := p8023.o - # Check the p8022 selections against net/core/Makefile. obj-$(CONFIG_SYSCTL) += sysctl_net_802.o obj-$(CONFIG_LLC) += p8022.o psnap.o @@ -11,5 +9,5 @@ obj-$(CONFIG_TR) += p8022.o psnap.o tr.o sysctl_net_802.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_IPX) += p8022.o psnap.o +obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o obj-$(CONFIG_ATALK) += p8022.o psnap.o diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 67465b65abe4..fa76220708ce 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -19,6 +19,7 @@ */ #include <asm/uaccess.h> /* for copy_from_user */ +#include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f2a8750bbf1d..0f604d227da2 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -214,7 +214,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, * This allows the VLAN to have a different MAC than the underlying * device, and still route correctly. */ - if (memcmp(eth_hdr(skb)->h_dest, skb->dev->dev_addr, ETH_ALEN) == 0) { + if (!compare_ether_addr(eth_hdr(skb)->h_dest, skb->dev->dev_addr)) { /* It is for our (changed) MAC-address! 
*/ skb->pkt_type = PACKET_HOST; } diff --git a/net/Kconfig b/net/Kconfig index 60f6f321bd76..9296b269d675 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -159,6 +159,7 @@ source "net/ipx/Kconfig" source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" source "net/lapb/Kconfig" +source "net/tipc/Kconfig" config NET_DIVERT bool "Frame Diverter (EXPERIMENTAL)" diff --git a/net/Makefile b/net/Makefile index f5141b9d4f38..065796f5fb17 100644 --- a/net/Makefile +++ b/net/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q/ obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_IEEE80211) += ieee80211/ +obj-$(CONFIG_TIPC) += tipc/ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_SYSCTL) += sysctl_net.o diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7982656b9c83..697ac55e29dc 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -52,6 +52,7 @@ */ #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ @@ -63,7 +64,7 @@ #include <linux/atalk.h> struct datalink_proto *ddp_dl, *aarp_dl; -static struct proto_ops atalk_dgram_ops; +static const struct proto_ops atalk_dgram_ops; /**************************************************************************\ * * @@ -1763,7 +1764,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr */ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - int rc = -EINVAL; + int rc = -ENOIOCTLCMD; struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; @@ -1813,23 +1814,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = atif_ioctl(cmd, argp); rtnl_unlock(); break; - /* Physical layer ioctl calls */ - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCGIFTXQLEN: - case SIOCSIFTXQLEN: - case SIOCGIFMTU: - case SIOCGIFCONF: - case 
SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCGIFCOUNT: - case SIOCGIFINDEX: - case SIOCGIFNAME: - rc = dev_ioctl(cmd, argp); - break; } return rc; @@ -1841,7 +1825,7 @@ static struct net_proto_family atalk_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { .family = PF_APPLETALK, .owner = THIS_MODULE, .release = atalk_release, diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 72f3f7b8de80..680ccb12aae8 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -18,6 +18,7 @@ Author: Marcell GAL, 2000, XDSL Ltd, Hungary #include <net/arp.h> #include <linux/atm.h> #include <linux/atmdev.h> +#include <linux/capability.h> #include <linux/seq_file.h> #include <linux/atmbr2684.h> @@ -295,14 +296,14 @@ static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev) unsigned char *rawp; eth = eth_hdr(skb); - if (*eth->h_dest & 1) { - if (memcmp(eth->h_dest, dev->broadcast, ETH_ALEN) == 0) + if (is_multicast_ether_addr(eth->h_dest)) { + if (!compare_ether_addr(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } - else if (memcmp(eth->h_dest, dev->dev_addr, ETH_ALEN)) + else if (compare_ether_addr(eth->h_dest, dev->dev_addr)) skb->pkt_type = PACKET_OTHERHOST; if (ntohs(eth->h_proto) >= 1536) diff --git a/net/atm/clip.c b/net/atm/clip.c index 4f54c9a5e84a..73370de97539 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -19,6 +19,7 @@ #include <linux/atmdev.h> #include <linux/atmclip.h> #include <linux/atmarp.h> +#include <linux/capability.h> #include <linux/ip.h> /* for net/route.h */ #include <linux/in.h> /* for struct sockaddr_in */ #include <linux/if.h> /* for IFF_UP */ diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index a150198b05a3..eb109af7eb4a 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -12,6 +12,7 @@ #include <linux/atmdev.h> #include <linux/atmclip.h> /* CLIP_*ENCAP 
*/ #include <linux/atmarp.h> /* manifest constants */ +#include <linux/capability.h> #include <linux/sonet.h> /* for ioctls */ #include <linux/atmsvc.h> #include <linux/atmmpc.h> diff --git a/net/atm/lec.c b/net/atm/lec.c index ad840b9afba8..c4fc722fef9a 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -7,6 +7,7 @@ #include <linux/config.h> #include <linux/kernel.h> #include <linux/bitops.h> +#include <linux/capability.h> /* We are ethernet device */ #include <linux/if_ether.h> @@ -1321,7 +1322,7 @@ static int lane2_associate_req (struct net_device *dev, u8 *lan_dst, struct sk_buff *skb; struct lec_priv *priv = (struct lec_priv*)dev->priv; - if ( memcmp(lan_dst, dev->dev_addr, ETH_ALEN) != 0 ) + if (compare_ether_addr(lan_dst, dev->dev_addr)) return (0); /* not our mac address */ kfree(priv->tlvs); /* NULL if there was no previous association */ @@ -1798,7 +1799,7 @@ lec_arp_find(struct lec_priv *priv, to_return = priv->lec_arp_tables[place]; while(to_return) { - if (memcmp(mac_addr, to_return->mac_addr, ETH_ALEN) == 0) { + if (!compare_ether_addr(mac_addr, to_return->mac_addr)) { return to_return; } to_return = to_return->next; @@ -1811,8 +1812,7 @@ make_entry(struct lec_priv *priv, unsigned char *mac_addr) { struct lec_arp_table *to_return; - to_return = (struct lec_arp_table *) kmalloc(sizeof(struct lec_arp_table), - GFP_ATOMIC); + to_return = kmalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); if (!to_return) { printk("LEC: Arp entry kmalloc failed\n"); return NULL; @@ -2002,7 +2002,7 @@ lec_arp_resolve(struct lec_priv *priv, unsigned char *mac_to_find, return priv->mcast_vcc; break; case 2: /* LANE2 wants arp for multicast addresses */ - if ( memcmp(mac_to_find, bus_mac, ETH_ALEN) == 0) + if (!compare_ether_addr(mac_to_find, bus_mac)) return priv->mcast_vcc; break; default: diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 526d9531411f..c304ef1513b9 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -3,6 +3,7 @@ #include <linux/timer.h> #include <linux/init.h> 
#include <linux/bitops.h> +#include <linux/capability.h> #include <linux/seq_file.h> /* We are an ethernet device */ @@ -552,7 +553,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) goto non_ip; /* Multi-Protocol Over ATM :-) */ while (i < mpc->number_of_mps_macs) { - if (memcmp(eth->h_dest, (mpc->mps_macs + i*ETH_ALEN), ETH_ALEN) == 0) + if (!compare_ether_addr(eth->h_dest, (mpc->mps_macs + i*ETH_ALEN))) if ( send_via_shortcut(skb, mpc) == 0 ) /* try shortcut */ return 0; /* success! */ i++; diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 58f4a2b5aebe..1489067c1e84 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -39,6 +39,7 @@ #include <linux/skbuff.h> #include <linux/atm.h> #include <linux/atmdev.h> +#include <linux/capability.h> #include <linux/ppp_defs.h> #include <linux/if_ppp.h> #include <linux/ppp_channel.h> diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2684a92da22b..f2c541774dcd 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, } -static struct proto_ops pvc_proto_ops = { +static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, diff --git a/net/atm/raw.c b/net/atm/raw.c index 4a0466e91aa6..3e57b17ca523 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/atmdev.h> +#include <linux/capability.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/mm.h> diff --git a/net/atm/resources.c b/net/atm/resources.c index c8c459fcb038..224190537c90 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -16,6 +16,7 @@ #include <linux/kernel.h> /* for barrier */ #include <linux/module.h> #include <linux/bitops.h> +#include <linux/capability.h> #include <linux/delay.h> #include <net/sock.h> /* for struct sock */ diff --git a/net/atm/svc.c b/net/atm/svc.c index d7b266136bf6..3a180cfd7b48 100644 --- 
a/net/atm/svc.c +++ b/net/atm/svc.c @@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return error; } -static struct proto_ops svc_proto_ops = { +static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1b683f302657..dbf9b47681f7 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -14,6 +14,7 @@ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) */ #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -54,7 +55,7 @@ HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); -static struct proto_ops ax25_proto_ops; +static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { @@ -1827,7 +1828,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - res = dev_ioctl(cmd, argp); + res = -ENOIOCTLCMD; break; } release_sock(sk); @@ -1944,7 +1945,7 @@ static struct net_proto_family ax25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops ax25_proto_ops = { +static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b1e945bd6ed3..f04f8630fd28 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -11,6 +11,8 @@ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) */ + +#include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index d53cc8615865..b8b5854bce9a 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -6,6 +6,8 @@ * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ + +#include <linux/capability.h> #include <linux/errno.h> 
#include <linux/types.h> #include <linux/socket.h> diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ea616e3fc98e..fb031fe9be9e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) timeo = schedule_timeout(timeo); lock_sock(sk); - if (sk->sk_err) { - err = sock_error(sk); + err = sock_error(sk); + if (err) break; - } } set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 682bf20af52d..cbb20c32a6c8 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -75,7 +75,7 @@ static struct bnep_session *__bnep_get_session(u8 *dst) list_for_each(p, &bnep_session_list) { s = list_entry(p, struct bnep_session, list); - if (!memcmp(dst, s->eh.h_source, ETH_ALEN)) + if (!compare_ether_addr(dst, s->eh.h_source)) return s; } return NULL; @@ -420,10 +420,10 @@ static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) iv[il++] = (struct kvec) { &type, 1 }; len++; - if (!memcmp(eh->h_dest, s->eh.h_source, ETH_ALEN)) + if (!compare_ether_addr(eh->h_dest, s->eh.h_source)) type |= 0x01; - if (!memcmp(eh->h_source, s->eh.h_dest, ETH_ALEN)) + if (!compare_ether_addr(eh->h_source, s->eh.h_dest)) type |= 0x02; if (type) diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9778c6acd53b..2bfe796cf05d 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -32,6 +32,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -146,7 +147,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } -static struct proto_ops bnep_sock_ops = { +static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = 
bnep_sock_release, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index beb045bf5714..8f8fad23f78a 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -137,7 +138,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops cmtp_sock_ops = { +static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a31244e58888..f812ed129e58 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -403,7 +403,7 @@ int hci_get_conn_list(void __user *arg) size = sizeof(req) + req.conn_num * sizeof(*ci); - if (!(cl = (void *) kmalloc(size, GFP_KERNEL))) + if (!(cl = kmalloc(size, GFP_KERNEL))) return -ENOMEM; if (!(hdev = hci_dev_get(req.dev_id))) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d6d0a15c099..bdb6458c6bd5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -575,7 +576,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char return 0; } -static struct proto_ops hci_sock_ops = { +static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index bd7568ac87fc..0ed38740388c 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -78,7 +78,7 @@ static struct class_device_attribute *bt_attrs[] = { }; #ifdef CONFIG_HOTPLUG -static int bt_hotplug(struct 
class_device *cdev, char **envp, int num_envp, char *buf, int size) +static int bt_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size) { struct hci_dev *hdev = class_get_devdata(cdev); int n, i = 0; @@ -107,7 +107,7 @@ struct class bt_class = { .name = "bluetooth", .release = bt_release, #ifdef CONFIG_HOTPLUG - .hotplug = bt_hotplug, + .uevent = bt_uevent, #endif }; diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index f8986f881431..b8f67761b886 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -24,6 +24,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -143,7 +144,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops hidp_sock_ops = { +static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e3bb11ca4235..f6b4a8085357 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -57,7 +58,7 @@ #define VERSION "2.8" -static struct proto_ops l2cap_sock_ops; +static const struct proto_ops l2cap_sock_ops; static struct bt_sock_list l2cap_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -767,8 +768,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -2160,7 +2162,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); -static struct 
proto_ops l2cap_sock_ops = { +static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 6c34261b232e..757d2dd3b02f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -58,7 +58,7 @@ #define BT_DBG(D...) #endif -static struct proto_ops rfcomm_sock_ops; +static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); -static struct proto_ops rfcomm_sock_ops = { +static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 158a9c46d863..74368f79ee5d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -34,6 +34,7 @@ #include <linux/tty_driver.h> #include <linux/tty_flip.h> +#include <linux/capability.h> #include <linux/slab.h> #include <linux/skbuff.h> @@ -480,13 +481,8 @@ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len); if (test_bit(TTY_DONT_FLIP, &tty->flags)) { - register int i; - for (i = 0; i < skb->len; i++) { - if (tty->flip.count >= TTY_FLIPBUF_SIZE) - tty_flip_buffer_push(tty); - - tty_insert_flip_char(tty, skb->data[i], 0); - } + tty_buffer_request_room(tty, skb->len); + tty_insert_flip_string(tty, skb->data, skb->len); tty_flip_buffer_push(tty); } else tty->ldisc.receive_buf(tty, skb->data, NULL, skb->len); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9cb00dc6c08c..6b61323ce23c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -56,7 +56,7 @@ #define VERSION "0.5" -static struct proto_ops sco_sock_ops; +static const 
struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; @@ -913,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); -static struct proto_ops sco_sock_ops = { +static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, diff --git a/net/bridge/br.c b/net/bridge/br.c index f8f184942aaf..188cc1ac49eb 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook); module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); +MODULE_VERSION(BR_VERSION); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f564ee99782d..0b33a7b3a00c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -15,7 +15,9 @@ #include <linux/kernel.h> #include <linux/netdevice.h> -#include <linux/module.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> + #include <asm/uaccess.h> #include "br_private.h" @@ -82,6 +84,87 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) return 0; } +/* Allow setting mac address of pseudo-bridge to be same as + * any of the bound interfaces + */ +static int br_set_mac_address(struct net_device *dev, void *p) +{ + struct net_bridge *br = netdev_priv(dev); + struct sockaddr *addr = p; + struct net_bridge_port *port; + int err = -EADDRNOTAVAIL; + + spin_lock_bh(&br->lock); + list_for_each_entry(port, &br->port_list, list) { + if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) { + br_stp_change_bridge_id(br, addr->sa_data); + err = 0; + break; + } + } + spin_unlock_bh(&br->lock); + + return err; +} + +static void 
br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "bridge"); + strcpy(info->version, BR_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "N/A"); +} + +static int br_set_sg(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_SG; + else + br->feature_mask &= ~NETIF_F_SG; + + br_features_recompute(br); + return 0; +} + +static int br_set_tso(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_TSO; + else + br->feature_mask &= ~NETIF_F_TSO; + + br_features_recompute(br); + return 0; +} + +static int br_set_tx_csum(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_IP_CSUM; + else + br->feature_mask &= ~NETIF_F_IP_CSUM; + + br_features_recompute(br); + return 0; +} + +static struct ethtool_ops br_ethtool_ops = { + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .set_sg = br_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = br_set_tx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = br_set_tso, +}; + void br_dev_setup(struct net_device *dev) { memset(dev->dev_addr, 0, ETH_ALEN); @@ -96,8 +179,12 @@ void br_dev_setup(struct net_device *dev) dev->change_mtu = br_change_mtu; dev->destructor = free_netdev; SET_MODULE_OWNER(dev); + SET_ETHTOOL_OPS(dev, &br_ethtool_ops); dev->stop = br_dev_stop; dev->tx_queue_len = 0; - dev->set_mac_address = NULL; + dev->set_mac_address = br_set_mac_address; dev->priv_flags = IFF_EBRIDGE; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 975abe254b7a..ba442883e877 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/init.h> 
#include <linux/rtnetlink.h> +#include <linux/if_ether.h> #include <net/sock.h> #include "br_private.h" @@ -32,9 +33,8 @@ * ethtool, use ethtool_ops. Also, since driver might sleep need to * not be holding any locks. */ -static int br_initial_port_cost(struct net_device *dev) +static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; struct ifreq ifr; mm_segment_t old_fs; @@ -58,10 +58,6 @@ static int br_initial_port_cost(struct net_device *dev) return 2; case SPEED_10: return 100; - default: - pr_info("bridge: can't decode speed from %s: %d\n", - dev->name, ecmd.speed); - return 100; } } @@ -75,6 +71,35 @@ static int br_initial_port_cost(struct net_device *dev) return 100; /* assume old 10Mbps */ } + +/* + * Check for port carrier transistions. + * Called from work queue to allow for calling functions that + * might sleep (such as speed check), and to debounce. + */ +static void port_carrier_check(void *arg) +{ + struct net_bridge_port *p = arg; + + rtnl_lock(); + if (netif_carrier_ok(p->dev)) { + u32 cost = port_cost(p->dev); + + spin_lock_bh(&p->br->lock); + if (p->state == BR_STATE_DISABLED) { + p->path_cost = cost; + br_stp_enable_port(p); + } + spin_unlock_bh(&p->br->lock); + } else { + spin_lock_bh(&p->br->lock); + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + spin_unlock_bh(&p->br->lock); + } + rtnl_unlock(); +} + static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; @@ -102,6 +127,9 @@ static void del_nbp(struct net_bridge_port *p) dev->br_port = NULL; dev_set_promiscuity(dev, -1); + cancel_delayed_work(&p->carrier_check); + flush_scheduled_work(); + spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); @@ -155,6 +183,7 @@ static struct net_device *new_bridge_dev(const char *name) br->bridge_id.prio[1] = 0x00; memset(br->bridge_id.addr, 0, ETH_ALEN); + br->feature_mask = dev->features; br->stp_enabled = 0; br->designated_root = br->bridge_id; 
br->root_path_cost = 0; @@ -195,10 +224,9 @@ static int find_portno(struct net_bridge *br) return (index >= BR_MAX_PORTS) ? -EXFULL : index; } -/* called with RTNL */ +/* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, - struct net_device *dev, - unsigned long cost) + struct net_device *dev) { int index; struct net_bridge_port *p; @@ -215,12 +243,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->br = br; dev_hold(dev); p->dev = dev; - p->path_cost = cost; + p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; dev->br_port = p; p->port_no = index; br_init_port(p); p->state = BR_STATE_DISABLED; + INIT_WORK(&p->carrier_check, port_carrier_check, p); kobject_init(&p->kobj); return p; @@ -295,7 +324,7 @@ int br_del_bridge(const char *name) return ret; } -/* Mtu of the bridge pseudo-device 1500 or the minimum of the ports */ +/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ int br_min_mtu(const struct net_bridge *br) { const struct net_bridge_port *p; @@ -304,7 +333,7 @@ int br_min_mtu(const struct net_bridge *br) ASSERT_RTNL(); if (list_empty(&br->port_list)) - mtu = 1500; + mtu = ETH_DATA_LEN; else { list_for_each_entry(p, &br->port_list, list) { if (!mtu || p->dev->mtu < mtu) @@ -322,9 +351,8 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO; - checksum = NETIF_F_IP_CSUM; /* least commmon subset */ + features = br->feature_mask &~ NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { if (!(p->dev->features @@ -351,7 +379,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (dev->br_port != NULL) return -EBUSY; - if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) + if (IS_ERR(p = new_nbp(br, dev))) return PTR_ERR(p); if 
((err = br_fdb_insert(br, p, dev->dev_addr))) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b88220a64cd8..e3a73cead6b6 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + if (p->state == BR_STATE_LEARNING) { + kfree_skb(skb); + goto out; + } + if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; @@ -63,7 +68,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } } - if (dest[0] & 1) { + if (is_multicast_ether_addr(dest)) { br_flood_forward(br, skb, !passedup); if (!passedup) br_pass_frame_up(br, skb); @@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; - if (p->state == BR_STATE_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source); - if (p->br->stp_enabled && !memcmp(dest, bridge_ula, 5) && !(dest[5] & 0xF0)) { @@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) NULL, br_stp_handle_bpdu); return 1; } + goto err; } - else if (p->state == BR_STATE_FORWARDING) { + if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { if (br_should_route_hook) { if (br_should_route_hook(pskb)) return 0; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index b8ce14b22181..159fb8409824 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/capability.h> #include <linux/kernel.h> #include <linux/if_bridge.h> #include <linux/netdevice.h> diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd53a5e..7cac3fb9f809 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/netfilter_bridge.h> @@ -33,8 +34,11 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> + #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> + #include <asm/uaccess.h> #include <asm/checksum.h> #include "br_private.h" @@ -390,8 +394,9 @@ inhdr_error: * target in particular. Save the original destination IP * address to be able to detect DNAT afterwards. */ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { struct iphdr *iph; __u32 len; @@ -408,8 +413,10 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, goto out; if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + u8 *vhdr = skb->data; skb_pull(skb, VLAN_HLEN); - (skb)->nh.raw += VLAN_HLEN; + skb_postpull_rcsum(skb, vhdr, VLAN_HLEN); + skb->nh.raw += VLAN_HLEN; } return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn); } @@ -425,8 +432,10 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, goto out; if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + u8 *vhdr = skb->data; skb_pull(skb, VLAN_HLEN); - (skb)->nh.raw += VLAN_HLEN; + skb_postpull_rcsum(skb, vhdr, VLAN_HLEN); + skb->nh.raw += VLAN_HLEN; } if (!pskb_may_pull(skb, sizeof(struct iphdr))) diff --git a/net/bridge/br_notify.c 
b/net/bridge/br_notify.c index 917311c6828b..a43a9c1d50d7 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_stp_recalculate_bridge_id(br); break; - case NETDEV_CHANGE: /* device is up but carrier changed */ - if (!(br->dev->flags & IFF_UP)) - break; - - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } + case NETDEV_CHANGE: + if (br->dev->flags & IFF_UP) + schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE); break; case NETDEV_FEAT_CHANGE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index bdf95a74d8cd..c5bd631ffcd5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -27,6 +27,10 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) +#define BR_PORT_DEBOUNCE (HZ/10) + +#define BR_VERSION "2.1" + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -78,6 +82,7 @@ struct net_bridge_port struct timer_list hold_timer; struct timer_list message_age_timer; struct kobject kobj; + struct work_struct carrier_check; struct rcu_head rcu; }; @@ -90,6 +95,7 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; + unsigned long feature_mask; /* STP */ bridge_id designated_root; @@ -201,6 +207,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); extern void br_stp_set_port_priority(struct net_bridge_port *p, diff --git a/net/bridge/br_stp_if.c 
b/net/bridge/br_stp_if.c index ac09b6a23523..cc047f7fb6ef 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p) } /* called under bridge lock */ -static void br_stp_change_bridge_id(struct net_bridge *br, - const unsigned char *addr) +void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { unsigned char oldaddr[6]; struct net_bridge_port *p; @@ -158,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || - compare_ether_addr(p->dev->dev_addr, addr) < 0) + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 98cf53c81fad..6f577f16c4c0 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index f6a19d53eaeb..0ac0355d16dd 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> @@ -248,7 +249,7 @@ int br_sysfs_addif(struct net_bridge_port *p) if (err) goto out2; - kobject_hotplug(&p->kobj, KOBJ_ADD); + kobject_uevent(&p->kobj, KOBJ_ADD); return 0; out2: kobject_del(&p->kobj); @@ -260,7 +261,7 @@ void br_sysfs_removeif(struct net_bridge_port *p) { pr_debug("br_sysfs_removeif\n"); sysfs_remove_link(&p->br->ifobj, p->dev->name); - kobject_hotplug(&p->kobj, KOBJ_REMOVE); + kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c70b3be23026..b84fc6075fe1 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. config BRIDGE_EBT_ULOG - tristate "ebt: ulog support" + tristate "ebt: ulog support (OBSOLETE)" depends on BRIDGE_NF_EBTABLES help + This option enables the old bridge-specific "ebt_ulog" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + This option adds the ulog watcher, that you can use in any rule in any ebtables table. The packet is passed to a userspace logging daemon using netlink multicast sockets. 
This differs diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 7323805b9726..f158fe67dd60 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -15,6 +15,7 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip.h> #include <linux/ip.h> +#include <net/ip.h> #include <linux/in.h> #include <linux/module.h> @@ -51,6 +52,8 @@ static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in, if (!(info->bitmask & EBT_IP_DPORT) && !(info->bitmask & EBT_IP_SPORT)) return EBT_MATCH; + if (ntohs(ih->frag_off) & IP_OFFSET) + return EBT_NOMATCH; pptr = skb_header_pointer(skb, ih->ihl*4, sizeof(_ports), &_ports); if (pptr == NULL) diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 662975be3d1d..a29c1232c420 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -3,15 +3,19 @@ * * Authors: * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * April, 2002 * */ +#include <linux/in.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_log.h> +#include <linux/netfilter.h> #include <linux/module.h> #include <linux/ip.h> +#include <linux/in.h> #include <linux/if_arp.h> #include <linux/spinlock.h> @@ -55,27 +59,30 @@ static void print_MAC(unsigned char *p) } #define myNIPQUAD(a) a[0], a[1], a[2], a[3] -static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, - const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) +static void +ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { - struct ebt_log_info *info = (struct ebt_log_info *)data; - char level_string[4] = "< >"; + unsigned int bitmask; - level_string[1] = '0' + info->loglevel; 
spin_lock_bh(&ebt_log_lock); - printk(level_string); - printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", - out ? out->name : ""); + printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); - printk("MAC source = "); print_MAC(eth_hdr(skb)->h_source); printk("MAC dest = "); print_MAC(eth_hdr(skb)->h_dest); printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); - if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + if (loginfo->type == NF_LOG_TYPE_LOG) + bitmask = loginfo->u.log.logflags; + else + bitmask = NF_LOG_MASK; + + if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)){ struct iphdr _iph, *ih; @@ -84,10 +91,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, printk(" INCOMPLETE IP header"); goto out; } - printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", - NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); - printk(" IP tos=0x%02X, IP proto=%d", ih->tos, - ih->protocol); + printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP " + "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), ih->tos, ih->protocol); if (ih->protocol == IPPROTO_TCP || ih->protocol == IPPROTO_UDP) { struct tcpudphdr _ports, *pptr; @@ -104,7 +110,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, goto out; } - if ((info->bitmask & EBT_LOG_ARP) && + if ((bitmask & EBT_LOG_ARP) && ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { struct arphdr _arph, *ah; @@ -144,6 +150,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, out: printk("\n"); spin_unlock_bh(&ebt_log_lock); + +} + +static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + 
li.u.log.level = info->loglevel; + li.u.log.logflags = info->bitmask; + + nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix); } static struct ebt_watcher log = @@ -154,13 +175,32 @@ static struct ebt_watcher log = .me = THIS_MODULE, }; +static struct nf_logger ebt_log_logger = { + .name = "ebt_log", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { - return ebt_register_watcher(&log); + int ret; + + ret = ebt_register_watcher(&log); + if (ret < 0) + return ret; + if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) { + printk(KERN_WARNING "ebt_log: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + + return 0; } static void __exit fini(void) { + nf_log_unregister_logger(&ebt_log_logger); ebt_unregister_watcher(&log); } diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index f8a8cdec16ee..0248c67277ee 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -10,6 +10,7 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_stp.h> +#include <linux/etherdevice.h> #include <linux/module.h> #define BPDU_TYPE_CONFIG 0 @@ -164,8 +165,8 @@ static int ebt_stp_check(const char *tablename, unsigned int hookmask, if (datalen != len) return -EINVAL; /* Make sure the match only receives stp frames */ - if (memcmp(e->destmac, bridge_ula, ETH_ALEN) || - memcmp(e->destmsk, msk, ETH_ALEN) || !(e->bitmask & EBT_DESTMAC)) + if (compare_ether_addr(e->destmac, bridge_ula) || + compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC)) return -EINVAL; return 0; diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index aae26ae2e61f..ce617b3dbbb8 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -3,6 +3,7 @@ * * Authors: * Bart De Schuymer 
<bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> * * November, 2004 * @@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, +static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) + const struct ebt_ulog_info *uloginfo, const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ulog_buffers[group]; spinlock_t *lock = &ub->lock; @@ -216,6 +216,39 @@ alloc_failure: goto unlock; } +/* this function is registered with the netfilter core */ +static void ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *li, + const char *prefix) +{ + struct ebt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; + loginfo.cprange = 0; + loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nlgroup = li->u.ulog.group; + loginfo.cprange = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + + ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL); +} + + static int ebt_ulog_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { @@ 
-240,6 +273,12 @@ static struct ebt_watcher ulog = { .me = THIS_MODULE, }; +static struct nf_logger ebt_ulog_logger = { + .name = EBT_ULOG_WATCHER, + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { int i, ret = 0; @@ -265,6 +304,13 @@ static int __init init(void) else if ((ret = ebt_register_watcher(&ulog))) sock_release(ebtulognl->sk_socket); + if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) { + printk(KERN_WARNING "ebt_ulog: not logging via ulog " + "since somebody else already registered for PF_BRIDGE\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + return ret; } @@ -273,6 +319,7 @@ static void __exit fini(void) ebt_ulog_buff_t *ub; int i; + nf_log_unregister_logger(&ebt_ulog_logger); ebt_unregister_watcher(&ulog); for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { ub = &ulog_buffers[i]; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f8ffbf6e2333..00729b3604f8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -944,7 +944,7 @@ static int do_replace(void __user *user, unsigned int len) if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = (char *)vmalloc(tmp.entries_size); + newinfo->entries = vmalloc(tmp.entries_size); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; @@ -1146,7 +1146,7 @@ int ebt_register_table(struct ebt_table *table) if (!newinfo) return -ENOMEM; - newinfo->entries = (char *)vmalloc(table->table->entries_size); + newinfo->entries = vmalloc(table->table->entries_size); if (!(newinfo->entries)) goto free_newinfo; diff --git a/net/core/datagram.c b/net/core/datagram.c index 1bcfef51ac58..f8d322e1ea92 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> +#include <linux/spinlock.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -200,6 
+201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } /** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + */ + +void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + if (flags & MSG_PEEK) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); +} + +EXPORT_SYMBOL(skb_kill_datagram); + +/** * skb_copy_datagram_iovec - Copy a datagram to an iovec. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index a5efc9ae010b..fd070a098f20 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -75,6 +75,7 @@ #include <asm/uaccess.h> #include <asm/system.h> #include <linux/bitops.h> +#include <linux/capability.h> #include <linux/config.h> #include <linux/cpu.h> #include <linux/types.h> @@ -626,7 +627,7 @@ struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mas * Network device names need to be valid file names to * to allow sysfs to work */ -static int dev_valid_name(const char *name) +int dev_valid_name(const char *name) { return !(*name == '\0' || !strcmp(name, ".") @@ -1092,15 +1093,12 @@ int skb_checksum_help(struct sk_buff *skb, int inward) goto out; } - if (offset > (int)skb->len) - BUG(); + BUG_ON(offset > (int)skb->len); csum = skb_checksum(skb, offset, skb->len-offset, 0); offset = skb->tail - skb->h.raw; - if (offset <= 0) - BUG(); - if (skb->csum + 2 > offset) - BUG(); + BUG_ON(offset <= 0); + BUG_ON(skb->csum + 2 > offset); *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; @@ -3270,13 +3268,13 @@ EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); EXPORT_SYMBOL(__skb_linearize); +EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index cb530eef0e39..05d60850840e 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -158,7 +158,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) int err = 0; struct dev_mc_list *dmi, *dmi1; - dmi1 = (struct dev_mc_list 
*)kmalloc(sizeof(*dmi), GFP_ATOMIC); + dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); spin_lock_bh(&dev->xmit_lock); for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { diff --git a/net/core/dv.c b/net/core/dv.c index 3f25f4aa4e66..cf581407538c 100644 --- a/net/core/dv.c +++ b/net/core/dv.c @@ -24,6 +24,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/init.h> #include <net/dst.h> @@ -457,7 +458,7 @@ void divert_frame(struct sk_buff *skb) unsigned char *skb_data_end = skb->data + skb->len; /* Packet is already aimed at us, return */ - if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN)) + if (!compare_ether_addr(eth->h_dest, skb->dev->dev_addr)) return; /* proto is not IP, do nothing */ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 0350586e9195..e6f76106a99b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> diff --git a/net/core/filter.c b/net/core/filter.c index 3a10e0bc90e8..a52665f75224 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() */ #include <linux/module.h> @@ -74,7 +75,7 @@ static inline void *load_pointer(struct sk_buff *skb, int k, * len is the number of filter blocks in the array. 
*/ -int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { struct sock_filter *fentry; /* We walk down these */ void *ptr; @@ -240,9 +241,9 @@ load_b: A = X; continue; case BPF_RET|BPF_K: - return ((unsigned int)fentry->k); + return fentry->k; case BPF_RET|BPF_A: - return ((unsigned int)A); + return A; case BPF_ST: mem[fentry->k] = A; continue; @@ -250,7 +251,7 @@ load_b: mem[fentry->k] = X; continue; default: - /* Invalid instruction counts as RET */ + WARN_ON(1); return 0; } @@ -283,10 +284,12 @@ load_b: * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain - * no references or jumps that are out of range, no illegal instructions - * and no backward jumps. It must end with a RET instruction + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. * - * Returns 0 if the rule set is legal or a negative errno code if not. + * All jumps are forward as they are not signed. + * + * Returns 0 if the rule set is legal or -EINVAL if not. */ int sk_chk_filter(struct sock_filter *filter, int flen) { @@ -298,48 +301,89 @@ int sk_chk_filter(struct sock_filter *filter, int flen) /* check the filter code now */ for (pc = 0; pc < flen; pc++) { - /* all jumps are forward as they are not signed */ ftest = &filter[pc]; - if (BPF_CLASS(ftest->code) == BPF_JMP) { - /* but they mustn't jump off the end */ - if (BPF_OP(ftest->code) == BPF_JA) { - /* - * Note, the large ftest->k might cause loops. - * Compare this with conditional jumps below, - * where offsets are limited. 
--ANK (981016) - */ - if (ftest->k >= (unsigned)(flen-pc-1)) - return -EINVAL; - } else { - /* for conditionals both must be safe */ - if (pc + ftest->jt +1 >= flen || - pc + ftest->jf +1 >= flen) - return -EINVAL; - } - } - /* check for division by zero -Kris Katterjohn 2005-10-30 */ - if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) - return -EINVAL; + /* Only allow valid instructions */ + switch (ftest->code) { + case BPF_ALU|BPF_ADD|BPF_K: + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_NEG: + case BPF_LD|BPF_W|BPF_ABS: + case BPF_LD|BPF_H|BPF_ABS: + case BPF_LD|BPF_B|BPF_ABS: + case BPF_LD|BPF_W|BPF_LEN: + case BPF_LD|BPF_W|BPF_IND: + case BPF_LD|BPF_H|BPF_IND: + case BPF_LD|BPF_B|BPF_IND: + case BPF_LD|BPF_IMM: + case BPF_LDX|BPF_W|BPF_LEN: + case BPF_LDX|BPF_B|BPF_MSH: + case BPF_LDX|BPF_IMM: + case BPF_MISC|BPF_TAX: + case BPF_MISC|BPF_TXA: + case BPF_RET|BPF_K: + case BPF_RET|BPF_A: + break; + + /* Some instructions need special checks */ - /* check that memory operations use valid addresses. */ - if (ftest->k >= BPF_MEMWORDS) { - /* but it might not be a memory operation... */ - switch (ftest->code) { - case BPF_ST: - case BPF_STX: - case BPF_LD|BPF_MEM: - case BPF_LDX|BPF_MEM: + case BPF_ALU|BPF_DIV|BPF_K: + /* check for division by zero */ + if (ftest->k == 0) return -EINVAL; - } + break; + + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + case BPF_ST: + case BPF_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + + case BPF_JMP|BPF_JA: + /* + * Note, the large ftest->k might cause loops. 
+ * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + + case BPF_JMP|BPF_JEQ|BPF_K: + case BPF_JMP|BPF_JEQ|BPF_X: + case BPF_JMP|BPF_JGE|BPF_K: + case BPF_JMP|BPF_JGE|BPF_X: + case BPF_JMP|BPF_JGT|BPF_K: + case BPF_JMP|BPF_JGT|BPF_X: + case BPF_JMP|BPF_JSET|BPF_K: + case BPF_JMP|BPF_JSET|BPF_X: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + + default: + return -EINVAL; } } - /* - * The program must end with a return. We don't care where they - * jumped within the script (its always forwards) but in the end - * they _will_ hit this. - */ return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL; } diff --git a/net/core/flow.c b/net/core/flow.c index 7e95b39de9fd..c4f25385029f 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -23,6 +23,7 @@ #include <net/flow.h> #include <asm/atomic.h> #include <asm/semaphore.h> +#include <linux/security.h> struct flow_cache_entry { struct flow_cache_entry *next; @@ -30,6 +31,7 @@ struct flow_cache_entry { u8 dir; struct flowi key; u32 genid; + u32 sk_sid; void *object; atomic_t *object_ref; }; @@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, flow_resolve_t resolver) { struct flow_cache_entry *fle, **head; @@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && + fle->sk_sid == sk_sid && flow_key_compare(key, &fle->key) == 0) { if (fle->genid == atomic_read(&flow_cache_genid)) { void *ret = fle->object; @@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, *head = fle; fle->family = 
family; fle->dir = dir; + fle->sk_sid = sk_sid; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; flow_count(cpu)++; @@ -221,7 +225,7 @@ nocache: void *obj; atomic_t *obj_ref; - resolver(key, family, dir, &obj, &obj_ref); + resolver(key, sk_sid, family, dir, &obj, &obj_ref); if (fle) { fle->genid = atomic_read(&flow_cache_genid); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e2137f3e489d..e8b2acbc8ea2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -16,6 +17,7 @@ #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/wireless.h> +#include <net/iw_handler.h> #define to_class_dev(obj) container_of(obj,struct class_device,kobj) #define to_net_dev(class) container_of(class, struct net_device, class_dev) @@ -84,16 +86,11 @@ static ssize_t netdev_store(struct class_device *dev, return ret; } -/* generate a read-only network device class attribute */ -#define NETDEVICE_ATTR(field, format_string) \ -NETDEVICE_SHOW(field, format_string) \ -static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \ - -NETDEVICE_ATTR(addr_len, fmt_dec); -NETDEVICE_ATTR(iflink, fmt_dec); -NETDEVICE_ATTR(ifindex, fmt_dec); -NETDEVICE_ATTR(features, fmt_long_hex); -NETDEVICE_ATTR(type, fmt_dec); +NETDEVICE_SHOW(addr_len, fmt_dec); +NETDEVICE_SHOW(iflink, fmt_dec); +NETDEVICE_SHOW(ifindex, fmt_dec); +NETDEVICE_SHOW(features, fmt_long_hex); +NETDEVICE_SHOW(type, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t format_addr(char *buf, const unsigned char *addr, int len) @@ -136,10 +133,6 @@ static ssize_t show_carrier(struct class_device *dev, char *buf) return -EINVAL; } -static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL); -static 
CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); - /* read-write attributes */ NETDEVICE_SHOW(mtu, fmt_dec); @@ -153,8 +146,6 @@ static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len) return netdev_store(dev, buf, len, change_mtu); } -static CLASS_DEVICE_ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu); - NETDEVICE_SHOW(flags, fmt_hex); static int change_flags(struct net_device *net, unsigned long new_flags) @@ -167,8 +158,6 @@ static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len return netdev_store(dev, buf, len, change_flags); } -static CLASS_DEVICE_ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags); - NETDEVICE_SHOW(tx_queue_len, fmt_ulong); static int change_tx_queue_len(struct net_device *net, unsigned long new_len) @@ -182,9 +171,6 @@ static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, siz return netdev_store(dev, buf, len, change_tx_queue_len); } -static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, - store_tx_queue_len); - NETDEVICE_SHOW(weight, fmt_dec); static int change_weight(struct net_device *net, unsigned long new_weight) @@ -198,24 +184,21 @@ static ssize_t store_weight(struct class_device *dev, const char *buf, size_t le return netdev_store(dev, buf, len, change_weight); } -static CLASS_DEVICE_ATTR(weight, S_IRUGO | S_IWUSR, show_weight, - store_weight); - - -static struct class_device_attribute *net_class_attributes[] = { - &class_device_attr_ifindex, - &class_device_attr_iflink, - &class_device_attr_addr_len, - &class_device_attr_tx_queue_len, - &class_device_attr_features, - &class_device_attr_mtu, - &class_device_attr_flags, - &class_device_attr_weight, - &class_device_attr_type, - &class_device_attr_address, - &class_device_attr_broadcast, - &class_device_attr_carrier, - NULL +static struct class_device_attribute net_class_attributes[] = { + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(iflink, S_IRUGO, show_iflink, 
NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(features, S_IRUGO, show_features, NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), + __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), + __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, + store_tx_queue_len), + __ATTR(weight, S_IRUGO | S_IWUSR, show_weight, store_weight), + {} }; /* Show a given an attribute in the statistics group */ @@ -313,13 +296,19 @@ static ssize_t wireless_show(struct class_device *cd, char *buf, char *)) { struct net_device *dev = to_net_dev(cd); - const struct iw_statistics *iw; + const struct iw_statistics *iw = NULL; ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(dev) && dev->get_wireless_stats - && (iw = dev->get_wireless_stats(dev)) != NULL) - ret = (*format)(iw, buf); + if (dev_isalive(dev)) { + if(dev->wireless_handlers && + dev->wireless_handlers->get_wireless_stats) + iw = dev->wireless_handlers->get_wireless_stats(dev); + else if (dev->get_wireless_stats) + iw = dev->get_wireless_stats(dev); + if (iw != NULL) + ret = (*format)(iw, buf); + } read_unlock(&dev_base_lock); return ret; @@ -369,14 +358,14 @@ static struct attribute_group wireless_group = { #endif #ifdef CONFIG_HOTPLUG -static int netdev_hotplug(struct class_device *cd, char **envp, - int num_envp, char *buf, int size) +static int netdev_uevent(struct class_device *cd, char **envp, + int num_envp, char *buf, int size) { struct net_device *dev = to_net_dev(cd); int i = 0; int n; - /* pass interface in env to hotplug. */ + /* pass interface to uevent. 
*/ envp[i++] = buf; n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; buf += n; @@ -407,8 +396,9 @@ static void netdev_release(struct class_device *cd) static struct class net_class = { .name = "net", .release = netdev_release, + .class_dev_attrs = net_class_attributes, #ifdef CONFIG_HOTPLUG - .hotplug = netdev_hotplug, + .uevent = netdev_uevent, #endif }; @@ -420,7 +410,8 @@ void netdev_unregister_sysfs(struct net_device * net) sysfs_remove_group(&class_dev->kobj, &netstat_group); #ifdef WIRELESS_EXT - if (net->get_wireless_stats) + if (net->get_wireless_stats || (net->wireless_handlers && + net->wireless_handlers->get_wireless_stats)) sysfs_remove_group(&class_dev->kobj, &wireless_group); #endif class_device_del(class_dev); @@ -431,8 +422,6 @@ void netdev_unregister_sysfs(struct net_device * net) int netdev_register_sysfs(struct net_device *net) { struct class_device *class_dev = &(net->class_dev); - int i; - struct class_device_attribute *attr; int ret; class_dev->class = &net_class; @@ -442,21 +431,17 @@ int netdev_register_sysfs(struct net_device *net) if ((ret = class_device_register(class_dev))) goto out; - for (i = 0; (attr = net_class_attributes[i]) != NULL; i++) { - if ((ret = class_device_create_file(class_dev, attr))) - goto out_unreg; - } - - if (net->get_stats && (ret = sysfs_create_group(&class_dev->kobj, &netstat_group))) goto out_unreg; #ifdef WIRELESS_EXT - if (net->get_wireless_stats && - (ret = sysfs_create_group(&class_dev->kobj, &wireless_group))) - goto out_cleanup; - + if (net->get_wireless_stats || (net->wireless_handlers && + net->wireless_handlers->get_wireless_stats)) { + ret = sysfs_create_group(&class_dev->kobj, &wireless_group); + if (ret) + goto out_cleanup; + } return 0; out_cleanup: if (net->get_stats) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a42a2c0..281a632fa6a6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> 
#include <linux/string.h> +#include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7fc3e9e28c34..39063122fbb7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -116,13 +116,13 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/sched.h> #include <linux/unistd.h> #include <linux/string.h> #include <linux/ptrace.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/capability.h> #include <linux/delay.h> #include <linux/timer.h> #include <linux/init.h> @@ -473,7 +473,6 @@ static char version[] __initdata = VERSION; static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread* t, const char* ifname); -static struct pktgen_thread* pktgen_find_thread(const char* name); static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname); static int pktgen_device_event(struct notifier_block *, unsigned long, void *); static void pktgen_run_all_threads(void); @@ -487,9 +486,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]); /* Module parameters, defaults. 
*/ static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d = 0; -static int pg_clone_skb_d = 0; -static int debug = 0; +static int pg_delay_d; +static int pg_clone_skb_d; +static int debug; static DECLARE_MUTEX(pktgen_sem); static struct pktgen_thread *pktgen_threads = NULL; @@ -2883,7 +2882,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char* ifname) return add_dev_to_thread(t, pkt_dev); } -static struct pktgen_thread *pktgen_find_thread(const char* name) +static struct pktgen_thread * __init pktgen_find_thread(const char* name) { struct pktgen_thread *t = NULL; @@ -2900,7 +2899,7 @@ static struct pktgen_thread *pktgen_find_thread(const char* name) return t; } -static int pktgen_create_thread(const char* name, int cpu) +static int __init pktgen_create_thread(const char* name, int cpu) { struct pktgen_thread *t = NULL; struct proc_dir_entry *pe; diff --git a/net/core/scm.c b/net/core/scm.c index e887d19be506..649d01ef35b6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/signal.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83fee37de38e..d0732e9c8560 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int fclone) { + struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; /* Get the HEAD */ - if (fclone) - skb = kmem_cache_alloc(skbuff_fclone_cache, - gfp_mask & ~__GFP_DMA); - else - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - + skb = kmem_cache_alloc(fclone ? 
skbuff_fclone_cache : skbuff_head_cache, + gfp_mask & ~__GFP_DMA); if (!skb) goto out; @@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb->tail = data; skb->end = data + size; + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->tso_size = 0; + shinfo->tso_segs = 0; + shinfo->ufo_size = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + if (fclone) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, child->fclone = SKB_FCLONE_UNAVAILABLE; } - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; - skb_shinfo(skb)->ufo_size = 0; - skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: @@ -792,8 +791,7 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) int end = offset + skb_shinfo(skb)->frags[i].size; if (end > len) { if (skb_cloned(skb)) { - if (!realloc) - BUG(); + BUG_ON(!realloc); if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; } @@ -895,8 +893,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - if (!list) - BUG(); + BUG_ON(!list); if (list->len <= eat) { /* Eaten as whole. 
*/ @@ -1200,8 +1197,7 @@ unsigned int skb_checksum(const struct sk_buff *skb, int offset, start = end; } } - if (len) - BUG(); + BUG_ON(len); return csum; } @@ -1283,8 +1279,7 @@ unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, start = end; } } - if (len) - BUG(); + BUG_ON(len); return csum; } @@ -1298,8 +1293,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) else csstart = skb_headlen(skb); - if (csstart > skb_headlen(skb)) - BUG(); + BUG_ON(csstart > skb_headlen(skb)); memcpy(to, skb->data, csstart); diff --git a/net/core/sock.c b/net/core/sock.c index 13cc3be4f056..6e00811d44bc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> @@ -1488,7 +1489,7 @@ int proto_register(struct proto *prot, int alloc_slab) } } - if (prot->twsk_obj_size) { + if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); @@ -1497,11 +1498,12 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab; sprintf(timewait_sock_slab_name, mask, prot->name); - prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, - prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (prot->twsk_slab == NULL) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } @@ -1548,12 +1550,12 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } - if (prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_slab); + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + const char *name = 
kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_slab); + kmem_cache_destroy(prot->twsk_prot->twsk_slab); kfree(name); - prot->twsk_slab = NULL; + prot->twsk_prot->twsk_slab = NULL; } } diff --git a/net/core/stream.c b/net/core/stream.c index 15bfd03e8024..35e25259fd95 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int done; do { - if (sk->sk_err) - return sock_error(sk); + int err = sock_error(sk); + if (err) + return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, + !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); @@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) && + sk_wait_event(sk, &current_timeo, !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; diff --git a/net/core/utils.c b/net/core/utils.c index 7b5970fc9e40..ac1d1fcf8673 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -162,7 +162,7 @@ EXPORT_SYMBOL(net_srandom); * is otherwise not dependent on the TCP/IP stack. */ -__u32 in_aton(const char *str) +__be32 in_aton(const char *str) { unsigned long l; unsigned int val; @@ -175,7 +175,7 @@ __u32 in_aton(const char *str) if (*str != '\0') { val = 0; - while (*str != '\0' && *str != '.') + while (*str != '\0' && *str != '.' 
&& *str != '\n') { val *= 10; val += *str - '0'; diff --git a/net/core/wireless.c b/net/core/wireless.c index 271ddb35b0b2..2add7ed609e9 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c @@ -78,6 +78,7 @@ #include <linux/seq_file.h> #include <linux/init.h> /* for __init */ #include <linux/if_arp.h> /* ARPHRD_ETHER */ +#include <linux/etherdevice.h> /* compare_ether_addr */ #include <linux/wireless.h> /* Pretty obvious */ #include <net/iw_handler.h> /* New driver API */ @@ -1506,7 +1507,7 @@ void wireless_spy_update(struct net_device * dev, /* Update all records that match */ for(i = 0; i < spydata->spy_number; i++) - if(!memcmp(address, spydata->spy_address[i], ETH_ALEN)) { + if(!compare_ether_addr(address, spydata->spy_address[i])) { memcpy(&(spydata->spy_stat[i]), wstats, sizeof(struct iw_quality)); match = i; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 344a8da153fc..87b27fff6e3b 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,3 +1,7 @@ +obj-$(CONFIG_IPV6) += dccp_ipv6.o + +dccp_ipv6-y := ipv6.o + obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c9a62cca22fc..ce9cb77c5c29 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) from = av->dccpav_buf + av->dccpav_buf_head; /* Check if buf_head wraps */ - if (av->dccpav_buf_head + len > av->dccpav_vec_len) { - const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); + if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) { + const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head; memcpy(to, from, tailsize); to += tailsize; @@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, const gfp_t priority) { - struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); + struct 
dccp_ackvec *av; + BUG_ON(len == 0); + + if (len > DCCP_MAX_ACKVEC_LEN) + return NULL; + + av = kmalloc(sizeof(*av) + len, priority); if (av != NULL) { av->dccpav_buf_len = len; av->dccpav_buf_head = @@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av) } static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; } static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; } @@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, */ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, const unsigned int packets, - const unsigned char state) + const unsigned char state) { unsigned int gap; signed long new_head; @@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); - unsigned int index = av->dccpav_buf_head; + u8 index = av->dccpav_buf_head; while (1) { const u8 len = dccp_ackvec_len(av, index); @@ -291,7 +297,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av) } #endif -static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) +static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av) { /* * As we're keeping track of the ack vector size (dccpav_vec_len) and @@ -301,9 +307,10 @@ static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) * draft-ietf-dccp-spec-11.txt Appendix A. 
-acme */ #if 0 - av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; - if (av->dccpav_buf_tail >= av->dccpav_vec_len) - av->dccpav_buf_tail -= av->dccpav_vec_len; + u32 new_buf_tail = av->dccpav_ack_ptr + 1; + if (new_buf_tail >= av->dccpav_vec_len) + new_buf_tail -= av->dccpav_vec_len; + av->dccpav_buf_tail = new_buf_tail; #endif av->dccpav_vec_len -= av->dccpav_sent_len; } @@ -326,7 +333,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, debug_prefix, 1, (unsigned long long)av->dccpav_ack_seqno, (unsigned long long)av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; } } @@ -389,7 +396,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, av->dccpav_ack_seqno, (unsigned long long) av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); } /* * If dccpav_ack_seqno was not received, no problem diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index d0fd6c60c574..f7dfb5f67b87 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -54,16 +54,16 @@ * @dccpav_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - unsigned int dccpav_buf_head; - unsigned int dccpav_buf_tail; u64 dccpav_buf_ackno; u64 dccpav_ack_seqno; u64 dccpav_ack_ackno; - unsigned int dccpav_ack_ptr; - unsigned int dccpav_sent_len; - unsigned int dccpav_vec_len; - unsigned int dccpav_buf_len; struct timeval dccpav_time; + u8 dccpav_buf_head; + u8 dccpav_buf_tail; + u8 dccpav_ack_ptr; + u8 dccpav_sent_len; + u8 dccpav_vec_len; + u8 dccpav_buf_len; u8 dccpav_buf_nonce; u8 dccpav_ack_nonce; u8 dccpav_buf[0]; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeeaf5c6e..de681c6ad081 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h 
index f97b85d55ad8..93f26dd6e6cb 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ -extern struct proto dccp_v4_prot; +extern struct proto dccp_prot; /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) @@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len); +extern int dccp_v4_init_sock(struct sock *sk); +extern int dccp_v4_destroy_sock(struct sock *sk); + extern void dccp_close(struct sock *sk, long timeout); extern struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, @@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk, extern int dccp_connect(struct sock *sk); extern int dccp_disconnect(struct sock *sk, int flags); +extern void dccp_unhash(struct sock *sk); extern int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); extern int dccp_setsockopt(struct sock *sk, int level, int optname, @@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void dccp_shutdown(struct sock *sk, int how); +extern int inet_dccp_listen(struct socket *sock, int backlog); +extern unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); +extern void dccp_v4_send_check(struct sock *sk, int len, + struct sk_buff *skb); +extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len); extern int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr); @@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb, extern int dccp_v4_send_reset(struct sock *sk, enum 
dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); +extern int dccp_invalid_packet(struct sk_buff *skb); + +static inline int dccp_bad_service_code(const struct sock *sk, + const __u32 service) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_service == service) + return 0; + return !dccp_list_has_service(dp->dccps_service_list, service); +} struct dccp_skb_cb { __u8 dccpd_type:4; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index f675d8e642d3..3f78c00e3822 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = dp->dccps_pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; if (dp->dccps_options.dccpo_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; diff --git a/net/dccp/input.c b/net/dccp/input.c index 3454d5941900..b6cba72b44e8 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) return 0; } -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned len) +static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, skb)) - goto discard; - - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_options.dccpo_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto discard; - - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - switch 
(dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: @@ -250,6 +233,37 @@ discard: return 0; } +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + return __dccp_rcv_established(sk, skb, dh, len); +discard: + __kfree_skb(skb); + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rcv_established); + static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, @@ -286,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_update_gsr(sk, dp->dccps_isr); /* @@ -309,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) @@ -329,7 +349,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dccp_set_state(sk, DCCP_PARTOPEN); /* Make sure socket is routed, for correct metrics. 
*/ - inet_sk_rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -398,9 +418,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { - dccp_rcv_established(sk, skb, dh, len); + __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued - (by dccp_rcv_established) */ + (by __dccp_rcv_established) */ } break; } @@ -444,7 +464,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (dccp_v4_conn_request(sk, skb) < 0) + if (inet_csk(sk)->icsk_af_ops->conn_request(sk, + skb) < 0) return 1; /* FIXME: do congestion control initialization */ @@ -471,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - if (dp->dccps_options.dccpo_send_ack_vector && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } /* @@ -566,3 +587,5 @@ discard: } return 0; } + +EXPORT_SYMBOL_GPL(dccp_rcv_state_process); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 656e13e38cfb..00f983226672 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,7 +19,9 @@ #include <net/icmp.h> #include <net/inet_hashtables.h> +#include <net/inet_sock.h> #include <net/sock.h> +#include <net/timewait_sock.h> #include <net/tcp_states.h> #include <net/xfrm.h> @@ -37,7 +39,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { - return inet_csk_get_port(&dccp_hashinfo, sk, snum); + 
return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void dccp_v4_hash(struct sock *sk) @@ -45,171 +48,14 @@ static void dccp_v4_hash(struct sock *sk) inet_hash(&dccp_hashinfo, sk); } -static void dccp_v4_unhash(struct sock *sk) +void dccp_unhash(struct sock *sk) { inet_unhash(&dccp_hashinfo, sk); } -/* called with local bh disabled */ -static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const u32 daddr = inet->rcv_saddr; - const u32 saddr = inet->daddr; - const int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - const struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } +EXPORT_SYMBOL_GPL(dccp_unhash); - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp != NULL) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { - /* Silly. Should hash-dance instead... 
*/ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -/* - * Bind a port for a connect operation and hash it. - */ -static int dccp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - do { - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == rover) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v4_check_established(sk, - rover, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, rover); - if (tb == NULL) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - /* All locks still held and bhs disabled */ - inet_bind_hash(sk, tb, rover); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(rover); - __inet_hash(&dccp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw != NULL) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = 
inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { - __inet_hash(&dccp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - -static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) +int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -259,9 +105,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin_port; inet->daddr = daddr; - dp->dccps_ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt != NULL) - dp->dccps_ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket @@ -269,7 +115,7 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v4_hash_connect(sk); + err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; @@ -287,16 +133,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, usin->sin_port); dccp_update_gss(sk, dp->dccps_iss); - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. 
- */ - dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); - inet->id = dp->dccps_iss ^ jiffies; err = dccp_connect(sk); @@ -316,6 +152,8 @@ failure: goto out; } +EXPORT_SYMBOL_GPL(dccp_v4_connect); + /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -354,7 +192,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - dp->dccps_pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* @@ -606,6 +444,17 @@ out: sock_put(sk); } +/* This routine computes an IPv4 DCCP checksum. */ +void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); +} + +EXPORT_SYMBOL_GPL(dccp_v4_send_check); + int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -641,16 +490,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk, dccp_hdr(skb)->dccph_sport); } -static inline int dccp_bad_service_code(const struct sock *sk, - const __u32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return 0; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -662,7 +501,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) const __u32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; - struct dst_entry *dst = NULL; /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ if (((struct rtable *)skb->dst)->rt_flags & @@ -703,7 +541,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - 
/* FIXME: Merge Aristeu's option parsing code when ready */ req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ ireq->opt = NULL; @@ -721,23 +558,22 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, dst)) + if (dccp_v4_send_response(sk, req, NULL)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); return 0; drop_and_free: - /* - * FIXME: should be reqsk_free after implementing req->rsk_ops - */ - __reqsk_free(req); + reqsk_free(req); drop: DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); dcb->dccpd_reset_code = reset_code; return -1; } +EXPORT_SYMBOL_GPL(dccp_v4_conn_request); + /* * The three way handshake has completed - we got a valid ACK or DATAACK - * now create the new socket. @@ -792,6 +628,8 @@ exit: return NULL; } +EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); + static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); @@ -1011,7 +849,9 @@ discard: return 0; } -static inline int dccp_invalid_packet(struct sk_buff *skb) +EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); + +int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; @@ -1065,29 +905,30 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; } - /* If the header checksum is incorrect, drop packet and return */ - if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, - skb->nh.iph->daddr) < 0) { - LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " - "incorrect\n"); - return 1; - } - return 0; } +EXPORT_SYMBOL_GPL(dccp_invalid_packet); + /* this is called when real data arrives */ int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; struct sock *sk; - int rc; /* Step 1: Check header basics: */ if (dccp_invalid_packet(skb)) goto discard_it; + /* If the header checksum is incorrect, drop packet and return */ + if 
(dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n", + __FUNCTION__); + goto discard_it; + } + dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); @@ -1143,28 +984,11 @@ int dccp_v4_rcv(struct sk_buff *skb) goto do_time_wait; } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - dccp_pr_debug("xfrm4_policy_check failed\n"); + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - } - - if (sk_filter(sk, skb, 0)) { - dccp_pr_debug("sk_filter failed\n"); - goto discard_and_relse; - } - - skb->dev = NULL; - - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v4_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); + nf_reset(skb); - sock_put(sk); - return rc; + return sk_receive_skb(sk, skb); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -1194,9 +1018,23 @@ do_time_wait: goto no_dccp_socket; } -static int dccp_v4_init_sock(struct sock *sk) +struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v4_conn_request, + .syn_recv_sock = dccp_v4_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + +int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); static int dccp_ctl_socket_init = 1; dccp_options_init(&dp->dccps_options); @@ -1236,9 +1074,11 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); - inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + icsk->icsk_af_ops 
= &dccp_ipv4_af_ops; + icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; @@ -1246,7 +1086,9 @@ static int dccp_v4_init_sock(struct sock *sk) return 0; } -static int dccp_v4_destroy_sock(struct sock *sk) +EXPORT_SYMBOL_GPL(dccp_v4_init_sock); + +int dccp_v4_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1279,6 +1121,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock); + static void dccp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); @@ -1293,7 +1137,11 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; -struct proto dccp_v4_prot = { +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + +struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, .close = dccp_close, @@ -1307,7 +1155,7 @@ struct proto dccp_v4_prot = { .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v4_do_rcv, .hash = dccp_v4_hash, - .unhash = dccp_v4_unhash, + .unhash = dccp_unhash, .accept = inet_csk_accept, .get_port = dccp_v4_get_port, .shutdown = dccp_shutdown, @@ -1316,5 +1164,7 @@ struct proto dccp_v4_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), + .twsk_prot = &dccp_timewait_sock_ops, }; + +EXPORT_SYMBOL_GPL(dccp_prot); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c new file mode 100644 index 000000000000..df074259f9c3 --- /dev/null +++ b/net/dccp/ipv6.c @@ -0,0 +1,1262 @@ +/* + * DCCP over IPv6 + * Linux INET6 implementation + * + * Based on net/dccp6/ipv6.c + * + * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * 
as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/xfrm.h> + +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/inet_hashtables.h> +#include <net/inet_sock.h> +#include <net/inet6_connection_sock.h> +#include <net/inet6_hashtables.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_checksum.h> +#include <net/xfrm.h> + +#include "dccp.h" +#include "ipv6.h" + +static void dccp_v6_ctl_send_reset(struct sk_buff *skb); +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req); +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); + +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +static struct inet_connection_sock_af_ops dccp_ipv6_mapped; +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; + +static int dccp_v6_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet6_csk_bind_conflict); +} + +static void dccp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != DCCP_CLOSED) { + if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { + dccp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(&dccp_hashinfo, sk); + local_bh_enable(); + } +} + +static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base); +} + +static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + + if (skb->protocol == htons(ETH_P_IPV6)) + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + dh->dccph_dport, + dh->dccph_sport); + else + 
return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dh->dccph_dport, + dh->dccph_sport); +} + +static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if (ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. 
+ */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * DCCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &dccp_ipv6_mapped; + sk->sk_backlog_rcv = dccp_v4_do_rcv; + + err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_backlog_rcv = dccp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, saddr ? 
saddr : &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto failure; + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + inet->dport = usin->sin6_port; + + dccp_set_state(sk, DCCP_REQUESTING); + err = inet6_hash_connect(&dccp_death_row, sk); + if (err) + goto late_failure; + /* FIXME */ +#if 0 + dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); +#endif + err = dccp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + dccp_set_state(sk, DCCP_CLOSED); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + __u64 seq; + + sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport, + &hdr->saddr, dh->dccph_sport, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct 
inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + dccp_sync_mss(sk, dst_mtu(dst)); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + seq = DCCP_SKB_CB(skb)->dccpd_seq; + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, + &hdr->daddr, &hdr->saddr, + inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. 
+ */ + BUG_TRAP(req->sk == NULL); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + /* + * Wake people up to see the error + * (see connect in sock.c) + */ + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr *final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = ireq6->iif; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.osrcrt == 2 && + ireq6->pktopts) { + struct sk_buff *pktopts = ireq6->pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(pktopts->nh.raw + + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, 
final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + struct dccp_hdr *dh = dccp_hdr(skb); + dh->dccph_checksum = dccp_v6_check(dh, skb->len, + &ireq6->loc_addr, + &ireq6->rmt_addr, + csum_partial((char *)dh, + skb->len, + skb->csum)); + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + +static void dccp_v6_reqsk_destructor(struct request_sock *req) +{ + if (inet6_rsk(req)->pktopts != NULL) + kfree_skb(inet6_rsk(req)->pktopts); +} + +static struct request_sock_ops dccp6_request_sock_ops = { + .family = AF_INET6, + .obj_size = sizeof(struct dccp6_request_sock), + .rtx_syn_ack = dccp_v6_send_response, + .send_ack = dccp_v6_reqsk_send_ack, + .destructor = dccp_v6_reqsk_destructor, + .send_reset = dccp_v6_ctl_send_reset, +}; + +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr, + len, IPPROTO_DCCP, + csum_partial((char *)dh, + dh->dccph_doff << 2, + skb->csum)); +} + +static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb) +{ + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct flowi fl; + u64 seqno; + + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (!ipv6_unicast_destination(rxskb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. 
+ */ + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Swap the send and the receive. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*dh), IPPROTO_DCCP, + skb->csum); + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + /* sk = NULL, but it is safe for now. RST socket required. 
*/ + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb) +{ + struct flowi fl; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + + /* FIXME: calculate checksum, IPv4 also should... 
*/ + + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) +{ + dccp_v6_ctl_send_ack(skb); +} + +static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct ipv6hdr *iph = skb->nh.ipv6h; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet6_csk_search_req(sk, &prev, + dh->dccph_sport, + &iph->saddr, + &iph->daddr, + inet6_iif(skb)); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(&dccp_hashinfo, + &iph->saddr, dh->dccph_sport, + &iph->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + struct inet6_request_sock *ireq6; + struct ipv6_pinfo *np = inet6_sk(sk); + const __u32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (dccp_bad_service_code(sk, service)) { + reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * There are no SYN attacks on IPv6, yet... 
+ */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq6 = inet6_rsk(req); + ireq = inet_rsk(req); + ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ + ireq6->pktopts = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + ireq6->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. 
+ */ + dreq = dccp_rsk(req); + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v6_init_sequence(sk, skb); + dreq->dreq_service = service; + + if (dccp_v6_send_response(sk, req, NULL)) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + dcb->dccpd_reset_code = reset_code; + return -1; +} + +static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct dccp6_sock *newdp6; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + if (newsk == NULL) + return NULL; + + newdp6 = (struct dccp6_sock *)newsk; + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; + newsk->sk_backlog_rcv = dccp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, dccp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. 
+ */ + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.osrcrt == 2 && + opt == NULL && ireq6->pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw + + rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, dccp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newdp = dccp_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newsk->sk_bound_dev_if = ireq6->iif; + + /* Now IPv6 options... + + First: no IPv4 options. 
+ */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (ireq6->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC); + kfree_skb(ireq6->pktopts); + ireq6->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + dccp_sync_mss(newsk, dst_mtu(dst)); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __inet6_hash(&dccp_hashinfo, newsk); + inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... 
+ Fortunately, dccp_rcv_established and rcv_established + handle them correctly, but it is not case with + dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + } + + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. 
+ */ + if(nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + +reset: + dccp_v6_ctl_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +} + +static int dccp_v6_rcv(struct sk_buff **pskb) +{ + const struct dccp_hdr *dh; + struct sk_buff *skb = *pskb; + struct sock *sk; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + if (dccp_packet_without_ack(skb)) + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + else + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr, + dh->dccph_sport, + &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) + goto no_dccp_socket; + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + return sk_receive_skb(sk, skb) ? 
-1 : 0; + +no_dccp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v6_ctl_send_reset(skb); + } +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; +} + +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { + .queue_xmit = inet6_csk_xmit, + .send_check = dccp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * DCCP over IPv4 via INET6 API + */ +static struct inet_connection_sock_af_ops dccp_ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. 
+ */ +static int dccp_v6_init_sock(struct sock *sk) +{ + int err = dccp_v4_init_sock(sk); + + if (err == 0) + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + + return err; +} + +static int dccp_v6_destroy_sock(struct sock *sk) +{ + dccp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +static struct proto dccp_v6_prot = { + .name = "DCCPv6", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v6_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v6_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v6_do_rcv, + .hash = dccp_v6_hash, + .unhash = dccp_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v6_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v6_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp6_sock), + .rsk_prot = &dccp6_request_sock_ops, + .twsk_prot = &dccp6_timewait_sock_ops, +}; + +static struct inet6_protocol dccp_v6_protocol = { + .handler = dccp_v6_rcv, + .err_handler = dccp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct proto_ops inet6_dccp_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = dccp_poll, + .ioctl = inet6_ioctl, + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw dccp_v6_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v6_prot, + .ops = &inet6_dccp_ops, + .capability = -1, + .flags = INET_PROTOSW_ICSK, +}; + +static int 
__init dccp_v6_init(void) +{ + int err = proto_register(&dccp_v6_prot, 1); + + if (err != 0) + goto out; + + err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_unregister_proto; + + inet6_register_protosw(&dccp_v6_protosw); +out: + return err; +out_unregister_proto: + proto_unregister(&dccp_v6_prot); + goto out; +} + +static void __exit dccp_v6_exit(void) +{ + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); + proto_unregister(&dccp_v6_prot); +} + +module_init(dccp_v6_init); +module_exit(dccp_v6_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); +MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h new file mode 100644 index 000000000000..e4d4e9309270 --- /dev/null +++ b/net/dccp/ipv6.h @@ -0,0 +1,37 @@ +#ifndef _DCCP_IPV6_H +#define _DCCP_IPV6_H +/* + * net/dccp/ipv6.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/ipv6.h> + +struct dccp6_sock { + struct dccp_sock dccp; + /* + * ipv6_pinfo has to be the last member of dccp6_sock, + * see inet6_sk_generic. 
+ */ + struct ipv6_pinfo inet6; +}; + +struct dccp6_request_sock { + struct dccp_request_sock dccp; + struct inet6_request_sock inet6; +}; + +struct dccp6_timewait_sock { + struct inet_timewait_sock inet; + struct inet6_timewait_sock tw6; +}; + +#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1393461898bb..29261fc198e7 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = { (unsigned long)&dccp_death_row), }; +EXPORT_SYMBOL_GPL(dccp_death_row); + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -50,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); @@ -170,6 +183,8 @@ out_free: return newsk; } +EXPORT_SYMBOL_GPL(dccp_create_openreq_child); + /* * Process an incoming packet for RESPOND sockets represented * as an request_sock. 
@@ -214,7 +229,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; } - child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; @@ -236,6 +251,8 @@ drop: goto out; } +EXPORT_SYMBOL_GPL(dccp_check_req); + /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with @@ -266,3 +283,5 @@ int dccp_child_process(struct sock *parent, struct sock *child, sock_put(child); return ret; } + +EXPORT_SYMBOL_GPL(dccp_child_process); diff --git a/net/dccp/output.c b/net/dccp/output.c index 74ff87025878..efd7ffb903a1 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> +#include <net/inet_sock.h> #include <net/sock.h> #include "ackvec.h" @@ -43,6 +44,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); struct dccp_hdr *dh; @@ -108,8 +110,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) break; } - dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, - inet->daddr); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (set_ack) dccp_event_ack_sent(sk); @@ -117,7 +118,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - err = ip_queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (err <= 0) return err; @@ -134,20 +135,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { + struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int 
mss_now; - - /* - * FIXME: we really should be using the af_specific thing to support - * IPv6. - * mss_now = pmtu - tp->af_specific->net_header_len - - * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); - */ - mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - - sizeof(struct dccp_hdr_ext); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ - mss_now -= dp->dccps_ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -160,12 +154,14 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ - dp->dccps_pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; dp->dccps_mss_cache = mss_now; return mss_now; } +EXPORT_SYMBOL_GPL(dccp_sync_mss); + void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); @@ -266,7 +262,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - if (inet_sk_rebuild_header(sk) != 0) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) return -EHOSTUNREACH; /* Routing failure or similar. */ return dccp_transmit_skb(sk, (skb_cloned(skb) ? 
@@ -321,6 +317,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, return skb; } +EXPORT_SYMBOL_GPL(dccp_make_response); + struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, const enum dccp_reset_codes code) @@ -377,6 +375,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, */ static inline void dccp_connect_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -385,10 +384,16 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* - * FIXME: set dp->{dccps_swh,dccps_swl}, with - * something like dccp_inc_seq - */ + dccp_update_gss(sk, dp->dccps_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); icsk->icsk_retransmits = 0; } @@ -420,6 +425,8 @@ int dccp_connect(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_connect); + void dccp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. 
*/ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8a6b2a9e4581..65b11ea90d85 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include <net/checksum.h> #include <net/inet_common.h> -#include <net/ip.h> +#include <net/inet_sock.h> #include <net/protocol.h> #include <net/sock.h> #include <net/xfrm.h> @@ -34,15 +34,18 @@ #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> -#include <linux/dccp.h> #include "ccid.h" #include "dccp.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; +EXPORT_SYMBOL_GPL(dccp_statistics); + atomic_t dccp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(dccp_orphan_count); + static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, @@ -149,6 +152,8 @@ int dccp_disconnect(struct sock *sk, int flags) return err; } +EXPORT_SYMBOL_GPL(dccp_disconnect); + /* * Wait for a DCCP event. * @@ -156,8 +161,8 @@ int dccp_disconnect(struct sock *sk, int flags) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. 
*/ -static unsigned int dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) +unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; @@ -205,12 +210,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL_GPL(dccp_poll); + int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { dccp_pr_debug("entry\n"); return -ENOIOCTLCMD; } +EXPORT_SYMBOL_GPL(dccp_ioctl); + static int dccp_setsockopt_service(struct sock *sk, const u32 service, char __user *optval, int optlen) { @@ -254,7 +263,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, int val; if (level != SOL_DCCP) - return ip_setsockopt(sk, level, optname, optval, optlen); + return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, + optname, optval, + optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -282,6 +293,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL_GPL(dccp_setsockopt); + static int dccp_getsockopt_service(struct sock *sk, int len, u32 __user *optval, int __user *optlen) @@ -320,8 +333,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, int val, len; if (level != SOL_DCCP) - return ip_getsockopt(sk, level, optname, optval, optlen); - + return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, + optname, optval, + optlen); if (get_user(len, optlen)) return -EFAULT; @@ -354,6 +368,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, return 0; } +EXPORT_SYMBOL_GPL(dccp_getsockopt); + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -410,6 +426,8 @@ out_discard: goto out_release; } +EXPORT_SYMBOL_GPL(dccp_sendmsg); + int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -507,7 +525,9 @@ out: return len; } -static int inet_dccp_listen(struct socket *sock, int 
backlog) +EXPORT_SYMBOL_GPL(dccp_recvmsg); + +int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; @@ -543,6 +563,8 @@ out: return err; } +EXPORT_SYMBOL_GPL(inet_dccp_listen); + static const unsigned char dccp_new_state[] = { /* current state: new state: action: */ [0] = DCCP_CLOSED, @@ -648,12 +670,16 @@ adjudge_to_death: sock_put(sk); } +EXPORT_SYMBOL_GPL(dccp_close); + void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("entry\n"); } -static struct proto_ops inet_dccp_ops = { +EXPORT_SYMBOL_GPL(dccp_shutdown); + +static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -681,11 +707,11 @@ extern struct net_proto_family inet_family_ops; static struct inet_protosw dccp_v4_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, + .prot = &dccp_prot, .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, - .flags = 0, + .flags = INET_PROTOSW_ICSK, }; /* @@ -760,13 +786,15 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); int dccp_debug; module_param(dccp_debug, int, 0444); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; int ehash_order, bhash_order, i; - int rc = proto_register(&dccp_v4_prot, 1); + int rc = proto_register(&dccp_prot, 1); if (rc) goto out; @@ -869,7 +897,7 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_hashinfo.bind_bucket_cachep = NULL; out_proto_unregister: - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); goto out; } @@ -892,7 +920,7 @@ static void __exit dccp_fini(void) get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); } module_init(dccp_init); diff --git a/net/decnet/af_decnet.c 
b/net/decnet/af_decnet.c index d402e9020c68..ce4aaf94860d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -122,6 +122,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include <net/flow.h> #include <asm/system.h> #include <asm/ioctls.h> +#include <linux/capability.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> @@ -149,7 +150,7 @@ static void dn_keepalive(struct sock *sk); #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) -static struct proto_ops dn_proto_ops; +static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; @@ -1252,7 +1253,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } @@ -2342,7 +2343,7 @@ static struct net_proto_family dn_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops dn_proto_ops = { +static const struct proto_ops dn_proto_ops = { .family = AF_DECnet, .owner = THIS_MODULE, .release = dn_release, diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 5610bb16dbf9..efbead83ba7f 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -25,6 +25,7 @@ */ #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 8d0cc3cf3e49..33ab256cfd4a 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb) } } - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + /* Only use routers in our area */ + if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) { + if (!dn_db->router) { + 
dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } } write_unlock(&neigh->lock); neigh_release(neigh); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 369f25b60f3f..44bda85e678f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) got_it: if (sk != NULL) { struct dn_scp *scp = DN_SK(sk); - int ret; /* Reset backoff */ scp->nsp_rxtshift = 0; @@ -807,21 +806,7 @@ got_it: goto free_out; } - bh_lock_sock(sk); - ret = NET_RX_SUCCESS; - if (decnet_debug_level & 8) - printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", - (int)cb->rt_flags, (int)cb->nsp_flags, - (int)cb->src_port, (int)cb->dst_port, - !!sock_owned_by_user(sk)); - if (!sock_owned_by_user(sk)) - ret = dn_nsp_backlog_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + return sk_receive_skb(sk, skb); } return dn_nsp_no_socket(skb, reason); diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 1ab94c6e22ed..16a5a31e2126 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -26,8 +26,6 @@ #include <net/dn.h> #include <net/dn_route.h> -#include <linux/netfilter_decnet.h> - static struct sock *dnrmg = NULL; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 34fdac51df96..c792994d7952 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include <linux/if_arp.h> #include <linux/wireless.h> #include <linux/skbuff.h> +#include <linux/udp.h> #include <net/sock.h> #include <net/inet_common.h> #include <linux/stat.h> @@ -45,7 +46,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -static struct proto_ops econet_ops; +static const struct proto_ops econet_ops; static struct hlist_head econet_sklist; static 
DEFINE_RWLOCK(econet_lock); @@ -56,7 +57,7 @@ static struct net_device *net2dev_map[256]; #define EC_PORT_IP 0xd2 #ifdef CONFIG_ECONET_AUNUDP -static spinlock_t aun_queue_lock; +static DEFINE_SPINLOCK(aun_queue_lock); static struct socket *udpsock; #define AUN_PORT 0x8000 @@ -686,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg break; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } /*NOTREACHED*/ return 0; @@ -698,7 +699,7 @@ static struct net_proto_family econet_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { .family = PF_ECONET, .owner = THIS_MODULE, .release = econet_release, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index e24577367274..9890fd97e538 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -53,6 +53,7 @@ #include <linux/errno.h> #include <linux/config.h> #include <linux/init.h> +#include <linux/if_ether.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> @@ -162,7 +163,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb_pull(skb,ETH_HLEN); eth = eth_hdr(skb); - if (*eth->h_dest&1) { + if (is_multicast_ether_addr(eth->h_dest)) { if (!compare_ether_addr(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else @@ -251,7 +252,7 @@ static int eth_mac_addr(struct net_device *dev, void *p) static int eth_change_mtu(struct net_device *dev, int new_mtu) { - if ((new_mtu < 68) || (new_mtu > 1500)) + if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -272,7 +273,7 @@ void ether_setup(struct net_device *dev) dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; - dev->mtu = 1500; /* eth_mtu */ + dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = 1000; /* Ethernet wants good queues */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; diff --git 
a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c index 073aebdf0f67..f8dca31be5dd 100644 --- a/net/ieee80211/ieee80211_crypt_wep.c +++ b/net/ieee80211/ieee80211_crypt_wep.c @@ -75,22 +75,14 @@ static void prism2_wep_deinit(void *priv) kfree(priv); } -/* Perform WEP encryption on given skb that has at least 4 bytes of headroom - * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, - * so the payload length increases with 8 bytes. - * - * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) - */ -static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ +static int prism2_wep_build_iv(struct sk_buff *skb, int hdr_len, void *priv) { struct prism2_wep_data *wep = priv; - u32 crc, klen, len; - u8 key[WEP_KEY_LEN + 3]; - u8 *pos, *icv; - struct scatterlist sg; - - if (skb_headroom(skb) < 4 || skb_tailroom(skb) < 4 || - skb->len < hdr_len) + u32 klen, len; + u8 *pos; + + if (skb_headroom(skb) < 4 || skb->len < hdr_len) return -1; len = skb->len - hdr_len; @@ -112,15 +104,47 @@ static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) } /* Prepend 24-bit IV to RC4 key and TX frame */ - *pos++ = key[0] = (wep->iv >> 16) & 0xff; - *pos++ = key[1] = (wep->iv >> 8) & 0xff; - *pos++ = key[2] = wep->iv & 0xff; + *pos++ = (wep->iv >> 16) & 0xff; + *pos++ = (wep->iv >> 8) & 0xff; + *pos++ = wep->iv & 0xff; *pos++ = wep->key_idx << 6; + return 0; +} + +/* Perform WEP encryption on given skb that has at least 4 bytes of headroom + * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, + * so the payload length increases with 8 bytes. 
+ * + * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) + */ +static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct prism2_wep_data *wep = priv; + u32 crc, klen, len; + u8 *pos, *icv; + struct scatterlist sg; + u8 key[WEP_KEY_LEN + 3]; + + /* other checks are in prism2_wep_build_iv */ + if (skb_tailroom(skb) < 4) + return -1; + + /* add the IV to the frame */ + if (prism2_wep_build_iv(skb, hdr_len, priv)) + return -1; + + /* Copy the IV into the first 3 bytes of the key */ + memcpy(key, skb->data + hdr_len, 3); + /* Copy rest of the WEP key (the secret part) */ memcpy(key + 3, wep->key, wep->key_len); + + len = skb->len - hdr_len - 4; + pos = skb->data + hdr_len + 4; + klen = 3 + wep->key_len; - /* Append little-endian CRC32 and encrypt it to produce ICV */ + /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ crc = ~crc32_le(~0, pos, len); icv = skb_put(skb, 4); icv[0] = crc; @@ -231,6 +255,7 @@ static struct ieee80211_crypto_ops ieee80211_crypt_wep = { .name = "WEP", .init = prism2_wep_init, .deinit = prism2_wep_deinit, + .build_iv = prism2_wep_build_iv, .encrypt_mpdu = prism2_wep_encrypt, .decrypt_mpdu = prism2_wep_decrypt, .encrypt_msdu = NULL, diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c index 321287bc887f..90d18b72da3d 100644 --- a/net/ieee80211/ieee80211_module.c +++ b/net/ieee80211/ieee80211_module.c @@ -62,7 +62,7 @@ MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); -static inline int ieee80211_networks_allocate(struct ieee80211_device *ieee) +static int ieee80211_networks_allocate(struct ieee80211_device *ieee) { if (ieee->networks) return 0; @@ -90,7 +90,7 @@ static inline void ieee80211_networks_free(struct ieee80211_device *ieee) ieee->networks = NULL; } -static inline void ieee80211_networks_initialize(struct ieee80211_device *ieee) +static void ieee80211_networks_initialize(struct 
ieee80211_device *ieee) { int i; diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 03efaacbdb73..7a121802faa9 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -35,7 +35,7 @@ #include <net/ieee80211.h> -static inline void ieee80211_monitor_rx(struct ieee80211_device *ieee, +static void ieee80211_monitor_rx(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_rx_stats *rx_stats) { @@ -76,8 +76,8 @@ static struct ieee80211_frag_entry *ieee80211_frag_cache_find(struct if (entry->skb != NULL && entry->seq == seq && (entry->last_frag + 1 == frag || frag == -1) && - memcmp(entry->src_addr, src, ETH_ALEN) == 0 && - memcmp(entry->dst_addr, dst, ETH_ALEN) == 0) + !compare_ether_addr(entry->src_addr, src) && + !compare_ether_addr(entry->dst_addr, dst)) return entry; } @@ -165,7 +165,7 @@ static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee, * Responsible for handling management control frames * * Called by ieee80211_rx */ -static inline int +static int ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_rx_stats *rx_stats, u16 type, u16 stype) @@ -243,12 +243,12 @@ static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee, /* check that the frame is unicast frame to us */ if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) == IEEE80211_FCTL_TODS && - memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0 && - memcmp(hdr->addr3, dev->dev_addr, ETH_ALEN) == 0) { + !compare_ether_addr(hdr->addr1, dev->dev_addr) && + !compare_ether_addr(hdr->addr3, dev->dev_addr)) { /* ToDS frame with own addr BSSID and DA */ } else if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) == IEEE80211_FCTL_FROMDS && - memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0) { + !compare_ether_addr(hdr->addr1, dev->dev_addr)) { /* FromDS frame with own addr as DA */ } else return 0; @@ -266,7 +266,7 @@ static int ieee80211_is_eapol_frame(struct ieee80211_device 
*ieee, } /* Called only as a tasklet (software IRQ), by ieee80211_rx */ -static inline int +static int ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb, struct ieee80211_crypt_data *crypt) { @@ -297,7 +297,7 @@ ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb, } /* Called only as a tasklet (software IRQ), by ieee80211_rx */ -static inline int +static int ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee, struct sk_buff *skb, int keyidx, struct ieee80211_crypt_data *crypt) @@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, return 1; } - if ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : - ieee->host_decrypt) { + if (is_multicast_ether_addr(hdr->addr1) + ? ieee->host_mc_decrypt : ieee->host_decrypt) { int idx = 0; if (skb->len >= hdrlen + 3) idx = skb->data[hdrlen + 3] >> 6; @@ -506,7 +505,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, if (ieee->iw_mode == IW_MODE_MASTER && !wds && (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) == IEEE80211_FCTL_FROMDS && ieee->stadev - && memcmp(hdr->addr2, ieee->assoc_ap_addr, ETH_ALEN) == 0) { + && !compare_ether_addr(hdr->addr2, ieee->assoc_ap_addr)) { /* Frame from BSSID of the AP for which we are a client */ skb->dev = dev = ieee->stadev; stats = hostap_get_stats(dev); @@ -1157,7 +1156,7 @@ static int ieee80211_handle_assoc_resp(struct ieee80211_device *ieee, struct iee /***************************************************/ -static inline int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee80211_probe_response +static int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee80211_probe_response *beacon, struct ieee80211_network *network, struct ieee80211_rx_stats *stats) @@ -1232,11 +1231,11 @@ static inline int is_same_network(struct ieee80211_network *src, * as one network */ return ((src->ssid_len == 
dst->ssid_len) && (src->channel == dst->channel) && - !memcmp(src->bssid, dst->bssid, ETH_ALEN) && + !compare_ether_addr(src->bssid, dst->bssid) && !memcmp(src->ssid, dst->ssid, src->ssid_len)); } -static inline void update_network(struct ieee80211_network *dst, +static void update_network(struct ieee80211_network *dst, struct ieee80211_network *src) { int qos_active; @@ -1295,7 +1294,7 @@ static inline int is_beacon(int fc) return (WLAN_FC_GET_STYPE(le16_to_cpu(fc)) == IEEE80211_STYPE_BEACON); } -static inline void ieee80211_process_probe_response(struct ieee80211_device +static void ieee80211_process_probe_response(struct ieee80211_device *ieee, struct ieee80211_probe_response *beacon, struct ieee80211_rx_stats diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c index 445f206e65e0..8fdd943ebe8e 100644 --- a/net/ieee80211/ieee80211_tx.c +++ b/net/ieee80211/ieee80211_tx.c @@ -127,7 +127,7 @@ payload of each frame is reduced to 492 bytes. static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 }; static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 }; -static inline int ieee80211_copy_snap(u8 * data, u16 h_proto) +static int ieee80211_copy_snap(u8 * data, u16 h_proto) { struct ieee80211_snap_hdr *snap; u8 *oui; @@ -150,7 +150,7 @@ static inline int ieee80211_copy_snap(u8 * data, u16 h_proto) return SNAP_SIZE + sizeof(u16); } -static inline int ieee80211_encrypt_fragment(struct ieee80211_device *ieee, +static int ieee80211_encrypt_fragment(struct ieee80211_device *ieee, struct sk_buff *frag, int hdr_len) { struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx]; @@ -288,7 +288,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev) /* Determine total amount of storage required for TXB packets */ bytes = skb->len + SNAP_SIZE + sizeof(u16); - if (host_encrypt) + if (host_encrypt || host_build_iv) fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA | IEEE80211_FCTL_PROTECTED; else diff --git 
a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 181755f2aa8b..23e1630f50b7 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -42,7 +42,7 @@ static const char *ieee80211_modes[] = { }; #define MAX_CUSTOM_LEN 64 -static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, +static char *ipw2100_translate_scan(struct ieee80211_device *ieee, char *start, char *stop, struct ieee80211_network *network) { @@ -284,7 +284,7 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee, }; int i, key, key_provided, len; struct ieee80211_crypt_data **crypt; - int host_crypto = ieee->host_encrypt || ieee->host_decrypt; + int host_crypto = ieee->host_encrypt || ieee->host_decrypt || ieee->host_build_iv; IEEE80211_DEBUG_WX("SET_ENCODE\n"); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e55136ae09f4..011cca7ae02b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -456,6 +456,14 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default m + ---help--- + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. 
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f0435d00db6b..35e5f5999092 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o @@ -28,12 +28,13 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o -obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf249000..97c276f95b35 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -79,6 +79,7 @@ #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> +#include <linux/capability.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> @@ -93,6 +94,7 @@ #include <linux/smp_lock.h> #include <linux/inet.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> @@ -302,6 +304,7 @@ lookup_protocol: 
sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -775,16 +778,16 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = devinet_ioctl(cmd, (void __user *)arg); break; default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == - -ENOIOCTLCMD) - err = dev_ioctl(cmd, (void __user *)arg); + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; break; } return err; } -struct proto_ops inet_stream_ops = { +const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -805,7 +808,7 @@ struct proto_ops inet_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet_dgram_ops = { +const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -830,7 +833,7 @@ struct proto_ops inet_dgram_ops = { * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ -static struct proto_ops inet_sockraw_ops = { +static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -869,7 +872,8 @@ static struct inet_protosw inetsw_array[] = .ops = &inet_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }, { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <net/icmp.h> +#include <net/protocol.h> #include <asm/scatterlist.h> diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..accdefedfed7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -79,6 +79,7 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/capability.h> #include 
<linux/config.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -86,6 +87,7 @@ #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3e95a2..95b9d81ac488 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -32,6 +32,7 @@ #include <asm/uaccess.h> #include <asm/system.h> #include <linux/bitops.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -58,6 +59,7 @@ #endif #include <linux/kmod.h> +#include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include <linux/pfkeyv2.h> #include <linux/random.h> #include <net/icmp.h> +#include <net/protocol.h> #include <net/udp.h> /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..4e3d3811dea2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -20,6 +20,7 @@ #include <asm/uaccess.h> #include <asm/system.h> #include <linux/bitops.h> +#include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -30,6 +31,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> @@ -287,13 +289,13 @@ static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) { int i; - for (i=1; i<=RTA_MAX; i++) { - struct rtattr *attr = rta[i-1]; + for (i=1; i<=RTA_MAX; i++, rta++) { + struct rtattr *attr = *rta; if (attr) { if (RTA_PAYLOAD(attr) < 4) return -EINVAL; if (i != RTA_MULTIPATH && i != RTA_METRICS) - rta[i-1] = (struct 
rtattr*)RTA_DATA(attr); + *rta = (struct rtattr*)RTA_DATA(attr); } } return 0; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> @@ -36,6 +37,7 @@ #include <linux/netlink.h> #include <linux/init.h> +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 705e3ce86df9..e320b32373e5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -41,6 +41,13 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Substantial contributions to this work comes from: + * + * David S. Miller, <davem@davemloft.net> + * Stephen Hemminger <shemminger@osdl.org> + * Paul E. 
McKenney <paulmck@us.ibm.com> + * Patrick McHardy <kaber@trash.net> */ #define VERSION "0.404" @@ -59,6 +66,7 @@ #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..105039eb7629 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> @@ -898,8 +899,7 @@ static void icmp_address_reply(struct sk_buff *skb) u32 _mask, *mp; mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); - if (mp == NULL) - BUG(); + BUG_ON(mp == NULL); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (*mp == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..192092b89e53 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> + +#include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> @@ -973,7 +975,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. 
*/ - pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL); + pmc = kmalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; memset(pmc, 0, sizeof(*pmc)); @@ -1153,7 +1155,7 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) } } - im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + im = kmalloc(sizeof(*im), GFP_KERNEL); if (!im) goto out; @@ -1474,7 +1476,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, psf_prev = psf; } if (!psf) { - psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + psf = kmalloc(sizeof(*psf), GFP_ATOMIC); if (!psf) return -ENOBUFS; memset(psf, 0, sizeof(*psf)); @@ -1657,7 +1659,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) err = -ENOBUFS; if (count >= sysctl_igmp_max_memberships) goto done; - iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL); + iml = sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL); if (iml == NULL) goto done; @@ -1821,8 +1823,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) count += psl->sl_max; - newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, - IP_SFLSIZE(count), GFP_KERNEL); + newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -1905,8 +1906,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) goto done; } if (msf->imsf_numsrc) { - newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, - IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); + newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f1a566..ae20281d8deb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +int 
inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke return node != NULL; } +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -125,7 +130,7 @@ tb_found: goto success; } else { ret = 1; - if (inet_csk_bind_conflict(sk, tb)) + if (bind_conflict(sk, tb)) goto fail_unlock; } } @@ -380,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout) + unsigned long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; @@ -631,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39061ed53cfd..457db99c76df 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -50,9 +50,10 @@ static struct sock *idiagnl; #define INET_DIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -static int inet_diag_fill(struct 
sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) +static int inet_csk_diag_fill(struct sock *sk, + struct sk_buff *skb, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -70,20 +71,22 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_flags = nlmsg_flags; r = NLMSG_DATA(nlh); - if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) - minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, - sizeof(*minfo)); - if (ext & (1 << (INET_DIAG_INFO - 1))) - info = INET_DIAG_PUT(skb, INET_DIAG_INFO, - handler->idiag_info_size); - - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { - size_t len = strlen(icsk->icsk_ca_ops->name); - strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), - icsk->icsk_ca_ops->name); - } + BUG_ON(sk->sk_state == TCP_TIME_WAIT); + + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); + + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, + handler->idiag_info_size); + + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { + const size_t len = strlen(icsk->icsk_ca_ops->name); + + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), + icsk->icsk_ca_ops->name); } + r->idiag_family = sk->sk_family; r->idiag_state = sk->sk_state; r->idiag_timer = 0; @@ -93,37 +96,6 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, r->id.idiag_cookie[0] = (u32)(unsigned long)sk; r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - if (r->idiag_state == TCP_TIME_WAIT) { - const struct inet_timewait_sock *tw = inet_twsk(sk); - long tmo = tw->tw_ttd - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = tw->tw_sport; - r->id.idiag_dport = tw->tw_dport; - r->id.idiag_src[0] = tw->tw_rcv_saddr; - 
r->id.idiag_dst[0] = tw->tw_daddr; - r->idiag_state = tw->tw_substate; - r->idiag_timer = 3; - r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - return skb->len; - } - r->id.idiag_sport = inet->sport; r->id.idiag_dport = inet->dport; r->id.idiag_src[0] = inet->rcv_saddr; @@ -185,7 +157,75 @@ nlmsg_failure: return -1; } -static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, + struct sk_buff *skb, int ext, u32 pid, + u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + long tmo; + struct inet_diag_msg *r; + const unsigned char *previous_tail = skb->tail; + struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, + unlh->nlmsg_type, sizeof(*r)); + + r = NLMSG_DATA(nlh); + BUG_ON(tw->tw_state != TCP_TIME_WAIT); + + nlh->nlmsg_flags = nlmsg_flags; + + tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->idiag_family = tw->tw_family; + r->idiag_state = tw->tw_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->id.idiag_if = tw->tw_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)tw; + r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1); + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || 
defined (CONFIG_IPV6_MODULE) + if (tw->tw_family == AF_INET6) { + const struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tw6->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tw6->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - previous_tail; + return skb->len; +nlmsg_failure: + skb_trim(skb, previous_tail - skb->data); + return -1; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, + skb, ext, pid, seq, nlmsg_flags, + unlh); + return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh); +} + +static int inet_diag_get_exact(struct sk_buff *in_skb, + const struct nlmsghdr *nlh) { int err; struct sock *sk; @@ -235,7 +275,7 @@ static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nl if (!rep) goto out; - if (inet_diag_fill(rep, sk, req->idiag_ext, + if (sk_diag_fill(sk, rep, req->idiag_ext, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 0, nlh) <= 0) BUG(); @@ -283,7 +323,7 @@ static int bitstring_match(const u32 *a1, const u32 *a2, int bits) static int inet_diag_bc_run(const void *bc, int len, - const struct inet_diag_entry *entry) + const struct inet_diag_entry *entry) { while (len > 0) { int yes = 1; @@ -322,7 +362,7 @@ static int inet_diag_bc_run(const void *bc, int len, yes = 0; break; } - + if (cond->prefix_len == 0) break; @@ -331,7 +371,8 @@ static int inet_diag_bc_run(const void *bc, int len, else addr = entry->daddr; - if (bitstring_match(addr, cond->addr, cond->prefix_len)) + if (bitstring_match(addr, cond->addr, + cond->prefix_len)) break; if (entry->family == AF_INET6 && cond->family == AF_INET) { @@ -346,7 +387,7 @@ static int inet_diag_bc_run(const void *bc, int len, } } - if (yes) { + if (yes) { len -= op->yes; bc += op->yes; 
} else { @@ -407,14 +448,15 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) default: return -EINVAL; } - bc += op->yes; + bc += op->yes; len -= op->yes; } return len == 0 ? 0 : -EINVAL; } -static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) +static int inet_csk_diag_dump(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb) { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); @@ -444,14 +486,50 @@ static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, return 0; } - return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); + return inet_csk_diag_fill(sk, skb, r->idiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct inet_diag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + + entry.family = tw->tw_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (tw->tw_family == AF_INET6) { + struct inet6_timewait_sock *tw6 = + inet6_twsk((struct sock *)tw); + entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; + entry.daddr = tw6->tw_v6_daddr.s6_addr32; + } else +#endif + { + entry.saddr = &tw->tw_rcv_saddr; + entry.daddr = &tw->tw_daddr; + } + entry.sport = tw->tw_num; + entry.dport = ntohs(tw->tw_dport); + entry.userlocks = 0; + + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return inet_twsk_diag_fill(tw, skb, r->idiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - u32 pid, u32 seq, - const struct nlmsghdr *unlh) + struct request_sock *req, u32 pid, u32 seq, + const 
struct nlmsghdr *unlh) { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); @@ -489,9 +567,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); + &inet6_rsk(req)->loc_addr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); + &inet6_rsk(req)->rmt_addr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -504,7 +582,7 @@ nlmsg_failure: } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) + struct netlink_callback *cb) { struct inet_diag_entry entry; struct inet_diag_req *r = NLMSG_DATA(cb->nlh); @@ -553,13 +631,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.saddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : + inet6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; - entry.daddr = + entry.daddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? 
- tcp6_rsk(req)->rmt_addr.s6_addr32 : + inet6_rsk(req)->rmt_addr.s6_addr32 : #endif &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); @@ -599,7 +677,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) handler = inet_diag_table[cb->nlh->nlmsg_type]; BUG_ON(handler == NULL); hashinfo = handler->idiag_hashinfo; - + s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -630,7 +708,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[3] > 0) goto syn_recv; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { + if (inet_csk_diag_dump(sk, skb, cb) < 0) { inet_listen_unlock(hashinfo); goto done; } @@ -672,7 +750,6 @@ skip_listen_ht: s_num = 0; read_lock_bh(&head->lock); - num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); @@ -684,9 +761,10 @@ skip_listen_ht: if (r->id.idiag_sport != inet->sport && r->id.idiag_sport) goto next_normal; - if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) goto next_normal; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { + if (inet_csk_diag_dump(sk, skb, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -695,19 +773,20 @@ next_normal: } if (r->idiag_states & TCPF_TIME_WAIT) { - sk_for_each(sk, node, + struct inet_timewait_sock *tw; + + inet_twsk_for_each(tw, node, &hashinfo->ehash[i + hashinfo->ehash_size].chain) { - struct inet_sock *inet = inet_sk(sk); if (num < s_num) goto next_dying; - if (r->id.idiag_sport != inet->sport && + if (r->id.idiag_sport != tw->tw_sport && r->id.idiag_sport) goto next_dying; - if (r->id.idiag_dport != inet->dport && + if (r->id.idiag_dport != tw->tw_dport && r->id.idiag_dport) goto next_dying; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { + if (inet_twsk_diag_dump(tw, skb, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -724,8 +803,7 @@ done: return skb->len; } -static __inline__ int -inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr 
*nlh) +static inline int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; @@ -755,9 +833,8 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } return netlink_dump_start(idiagnl, skb, nlh, inet_diag_dump, NULL); - } else { + } else return inet_diag_get_exact(skb, nlh); - } err_inval: return -EINVAL; @@ -766,15 +843,15 @@ err_inval: static inline void inet_diag_rcv_skb(struct sk_buff *skb) { - int err; - struct nlmsghdr * nlh; - if (skb->len >= NLMSG_SPACE(0)) { - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + + if (nlh->nlmsg_len < sizeof(*nlh) || + skb->len < nlh->nlmsg_len) return; err = inet_diag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) + if (err || nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(skb, nlh, err); } } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe736d2..33228115cda4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,12 +15,14 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> +#include <net/ip.h> /* * Allocate and initialize a new local port bind bucket. 
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hash = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... 
*/ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + inet_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. 
+ */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet_hash(hinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row);; + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet_hash(hinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ + ret = __inet_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a68811..417f126c749e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, - SLAB_ATOMIC); + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd38924f..2160874ce7aa 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -304,8 +304,7 @@ static void unlink_from_pool(struct inet_peer *p) /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p); - if (*stackptr[-1] != t) - BUG(); + BUG_ON(*stackptr[-1] != t); **--stackptr = t->avl_left; /* t is removed, t->v4daddr > x->v4daddr for any * x in p->avl_left subtree. 
@@ -314,8 +313,7 @@ static void unlink_from_pool(struct inet_peer *p) t->avl_left = p->avl_left; t->avl_right = p->avl_right; t->avl_height = p->avl_height; - if (delp[1] != &p->avl_left) - BUG(); + BUG_ON(delp[1] != &p->avl_left); delp[1] = &t->avl_left; /* was &p->avl_left */ } peer_avl_rebalance(stack, stackptr); @@ -401,6 +399,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2ee48e..2a8adda15e11 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include <linux/compiler.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -38,6 +39,7 @@ #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> +#include <net/inetpeer.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -89,8 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. 
*/ fp = qp->fragments; while (fp) { @@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -373,7 +383,7 @@ out_nomem: */ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) { - __u16 id = iph->id; + __be16 id = iph->id; __u32 saddr = iph->saddr; __u32 daddr = iph->daddr; __u8 protocol = iph->protocol; @@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; +} + /* Add new segment to existing queue. 
*/ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 46f9d9cf7a5f..abe23923e4e7 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -10,6 +10,7 @@ * */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -28,6 +29,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> @@ -187,7 +189,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) } if (ipgre_fb_tunnel_dev->flags&IFF_UP) - return ipgre_fb_tunnel_dev->priv; + return netdev_priv(ipgre_fb_tunnel_dev); return NULL; } @@ -277,7 +279,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int return NULL; dev->init = ipgre_tunnel_init; - nt = dev->priv; + nt = netdev_priv(dev); nt->parms = *parms; if (register_netdevice(dev) < 0) { @@ -285,9 +287,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int goto failed; } - nt = dev->priv; - nt->parms = *parms; - dev_hold(dev); ipgre_tunnel_link(nt); return nt; @@ -298,7 +297,7 @@ failed: static void ipgre_tunnel_uninit(struct net_device *dev) { - ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv); + ipgre_tunnel_unlink(netdev_priv(dev)); dev_put(dev); } @@ -517,7 +516,7 @@ out: skb2->dst->ops->update_pmtu(skb2->dst, rel_info); rel_info = htonl(rel_info); } else if (type == ICMP_TIME_EXCEEDED) { - struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + struct ip_tunnel *t = 
netdev_priv(skb2->dev); if (t->parms.iph.ttl) { rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; @@ -668,7 +667,7 @@ drop_nolock: static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; struct iphdr *old_iph = skb->nh.iph; struct iphdr *tiph; @@ -831,6 +830,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, gre_hlen); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); dst_release(skb->dst); skb->dst = &rt->u.dst; @@ -913,7 +913,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t = ipgre_tunnel_locate(&p, 0); } if (t == NULL) - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); memcpy(&p, &t->parms, sizeof(p)); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; @@ -953,7 +953,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } else { unsigned nflags=0; - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); if (MULTICAST(p.iph.daddr)) nflags = IFF_BROADCAST; @@ -1002,7 +1002,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) goto done; err = -EPERM; - if (t == ipgre_fb_tunnel_dev->priv) + if (t == netdev_priv(ipgre_fb_tunnel_dev)) goto done; dev = t->dev; } @@ -1019,12 +1019,12 @@ done: static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) { - return &(((struct ip_tunnel*)dev->priv)->stat); + return &(((struct ip_tunnel*)netdev_priv(dev))->stat); } static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) { - struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); if (new_mtu < 68 || 
new_mtu > 0xFFF8 - tunnel->hlen) return -EINVAL; dev->mtu = new_mtu; @@ -1064,7 +1064,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len) { - struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); u16 *p = (u16*)(iph+1); @@ -1091,7 +1091,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned sh static int ipgre_open(struct net_device *dev) { - struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *t = netdev_priv(dev); if (MULTICAST(t->parms.iph.daddr)) { struct flowi fl = { .oif = t->parms.link, @@ -1115,7 +1115,7 @@ static int ipgre_open(struct net_device *dev) static int ipgre_close(struct net_device *dev) { - struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *t = netdev_priv(dev); if (MULTICAST(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev = inetdev_by_index(t->mlink); if (in_dev) { @@ -1140,7 +1140,7 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_IPGRE; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = 1500 - sizeof(struct iphdr) - 4; + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; @@ -1152,10 +1152,10 @@ static int ipgre_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel; struct iphdr *iph; int hlen = LL_MAX_HEADER; - int mtu = 1500; + int mtu = ETH_DATA_LEN; int addend = sizeof(struct iphdr) + 4; - tunnel = (struct ip_tunnel*)dev->priv; + tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; tunnel->dev = dev; @@ -1219,7 +1219,7 @@ static int ipgre_tunnel_init(struct net_device *dev) static int __init ipgre_fb_tunnel_init(struct net_device *dev) { - struct ip_tunnel *tunnel = (struct 
ip_tunnel*)dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; tunnel->dev = dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..18d7fad474d7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> @@ -184,7 +185,6 @@ int ip_call_ra_chain(struct sk_buff *skb) raw_rcv(last, skb2); } last = sk; - nf_reset(skb); } } @@ -203,10 +203,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) __skb_pull(skb, ihl); - /* Free reference early: we don't need it any more, and it may - hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); - /* Point into the IP datagram, just past the header. */ skb->h.raw = skb->data; @@ -231,10 +227,12 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; - if (!ipprot->no_policy && - !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + goto out; + } + nf_reset(skb); } ret = ipprot->handler(skb); if (ret < 0) { diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..9bebad07bf2e 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -11,6 +11,7 @@ * */ +#include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <asm/uaccess.h> @@ -22,6 +23,7 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> +#include <net/route.h> /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2bd397..3324fbfe528a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -69,6 +69,7 @@ #include <net/ip.h> #include 
<net/protocol.h> #include <net/route.h> +#include <net/xfrm.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> @@ -85,6 +86,8 @@ int sysctl_ip_default_ttl = IPDEFTTL; +static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); + /* Generate a checksum for an outgoing IP datagram. */ __inline__ void ip_send_check(struct iphdr *iph) { @@ -202,13 +205,16 @@ static inline int ip_finish_output2(struct sk_buff *skb) static inline int ip_finish_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; - - skb->dev = dev; - skb->protocol = htons(ETH_P_IP); - - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output2); +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb->dst->xfrm != NULL) + return xfrm4_output_finish(skb); +#endif + if (skb->len > dst_mtu(skb->dst) && + !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); } int ip_mc_output(struct sk_buff *skb) @@ -265,21 +271,21 @@ int ip_mc_output(struct sk_buff *skb) newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_mtu(&rt->u.dst)) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output); } int ip_output(struct sk_buff *skb) { + struct net_device *dev = skb->dst->dev; + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) @@ -411,7 +417,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * 
single device frame, and queue such a frame for sending. */ -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) { struct iphdr *iph; int raw = 0; @@ -420,7 +426,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; - int not_last_frag; + __be16 not_last_frag; struct rtable *rt = (struct rtable*)skb->dst; int err = 0; @@ -445,6 +451,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -1181,7 +1188,7 @@ int ip_push_pending_frames(struct sock *sk) struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr *iph; - int df = 0; + __be16 df = 0; __u8 ttl; int err = 0; @@ -1392,7 +1399,6 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d87257309..2bf8d782f678 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,12 +25,12 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <net/tcp.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> @@ -427,8 +427,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; - if (sk->sk_type == SOCK_STREAM) { - struct tcp_sock *tp = 
tcp_sk(sk); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & @@ -436,10 +436,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, inet->daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) - tp->ext_header_len -= inet->opt->optlen; + icsk->icsk_ext_hdr_len -= inet->opt->optlen; if (opt) - tp->ext_header_len += opt->optlen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif @@ -621,7 +621,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = -ENOBUFS; break; } - msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL); + msf = kmalloc(optlen, GFP_KERNEL); if (msf == 0) { err = -ENOBUFS; break; @@ -778,7 +778,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = -ENOBUFS; break; } - gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + gsf = kmalloc(optlen,GFP_KERNEL); if (gsf == 0) { err = -ENOBUFS; break; @@ -798,7 +798,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, goto mc_msf_out; } msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL); + msf = kmalloc(msize,GFP_KERNEL); if (msf == 0) { err = -ENOBUFS; goto mc_msf_out; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> +#include <net/protocol.h> struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include <linux/in.h> 
#include <linux/if.h> #include <linux/inet.h> +#include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> @@ -58,6 +59,7 @@ #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> +#include <net/route.h> #include <asm/uaccess.h> #include <net/checksum.h> diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c05c1df0bb04..e5cbe72c6b80 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -93,6 +93,7 @@ */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -108,6 +109,7 @@ #include <linux/mroute.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> #include <net/sock.h> #include <net/ip.h> @@ -243,7 +245,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c if (dev == NULL) return NULL; - nt = dev->priv; + nt = netdev_priv(dev); SET_MODULE_OWNER(dev); dev->init = ipip_tunnel_init; nt->parms = *parms; @@ -268,7 +270,7 @@ static void ipip_tunnel_uninit(struct net_device *dev) tunnels_wc[0] = NULL; write_unlock_bh(&ipip_lock); } else - ipip_tunnel_unlink((struct ip_tunnel*)dev->priv); + ipip_tunnel_unlink(netdev_priv(dev)); dev_put(dev); } @@ -442,7 +444,7 @@ out: skb2->dst->ops->update_pmtu(skb2->dst, rel_info); rel_info = htonl(rel_info); } else if (type == ICMP_TIME_EXCEEDED) { - struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + struct ip_tunnel *t = netdev_priv(skb2->dev); if (t->parms.iph.ttl) { rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; @@ -513,7 +515,7 @@ out: static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; @@ -620,6 +622,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 
skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED); dst_release(skb->dst); skb->dst = &rt->u.dst; @@ -672,7 +675,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t = ipip_tunnel_locate(&p, 0); } if (t == NULL) - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); memcpy(&p, &t->parms, sizeof(p)); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; @@ -709,7 +712,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; break; } - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); ipip_tunnel_unlink(t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; @@ -763,7 +766,7 @@ done: static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) { - return &(((struct ip_tunnel*)dev->priv)->stat); + return &(((struct ip_tunnel*)netdev_priv(dev))->stat); } static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) @@ -786,7 +789,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = 1500 - sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; @@ -798,7 +801,7 @@ static int ipip_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel; struct iphdr *iph; - tunnel = (struct ip_tunnel*)dev->priv; + tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; tunnel->dev = dev; @@ -836,7 +839,7 @@ static int ipip_tunnel_init(struct net_device *dev) static int __init ipip_fb_tunnel_init(struct net_device *dev) { - struct ip_tunnel *tunnel = dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; tunnel->dev = dev; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 
302b7eb507c9..5c94c222e3f3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -33,6 +33,7 @@ #include <asm/uaccess.h> #include <linux/types.h> #include <linux/sched.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> @@ -49,9 +50,11 @@ #include <linux/seq_file.h> #include <linux/mroute.h> #include <linux/init.h> +#include <linux/if_ether.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <net/route.h> #include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> @@ -176,8 +179,8 @@ static int reg_vif_num = -1; static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { read_lock(&mrt_lock); - ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len; - ((struct net_device_stats*)dev->priv)->tx_packets++; + ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len; + ((struct net_device_stats*)netdev_priv(dev))->tx_packets++; ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); @@ -186,13 +189,13 @@ static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) { - return (struct net_device_stats*)dev->priv; + return (struct net_device_stats*)netdev_priv(dev); } static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; - dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->hard_start_xmit = reg_vif_xmit; dev->get_stats = reg_vif_get_stats; @@ -1147,8 +1150,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_REGISTER) { vif->pkt_out++; vif->bytes_out+=skb->len; - ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; - ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len; + ((struct 
net_device_stats*)netdev_priv(vif->dev))->tx_packets++; ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; @@ -1208,8 +1211,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; - ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len; + ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++; + ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; @@ -1465,8 +1468,8 @@ int pim_rcv_v1(struct sk_buff * skb) skb->pkt_type = PACKET_HOST; dst_release(skb->dst); skb->dst = NULL; - ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; - ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; + ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; nf_reset(skb); netif_rx(skb); dev_put(reg_dev); @@ -1520,8 +1523,8 @@ static int pim_rcv(struct sk_buff * skb) skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; dst_release(skb->dst); - ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; - ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; + ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; skb->dst = NULL; nf_reset(skb); netif_rx(skb); diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d7eb680101c2..9b176a942ac5 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app) } -#if 0000 -/* - * Get reference to app by name (called from user context) - */ -struct ip_vs_app *ip_vs_app_get_by_name(char *appname) -{ - struct ip_vs_app *app, *a = NULL; - - 
down(&__ip_vs_app_mutex); - - list_for_each_entry(ent, &ip_vs_app_list, a_list) { - if (strcmp(app->name, appname)) - continue; - - /* softirq may call ip_vs_app_get too, so the caller - must disable softirq on the current CPU */ - if (ip_vs_app_get(app)) - a = app; - break; - } - - up(&__ip_vs_app_mutex); - - return a; -} -#endif - - /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..87b83813cf2c 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,11 @@ * */ +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/net.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> /* for proc_net_* */ #include <linux/seq_file.h> @@ -219,7 +223,7 @@ struct ip_vs_conn *ip_vs_conn_in_get if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -254,7 +258,7 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -295,7 +299,7 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -391,8 +395,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); 
cp->dest = dest; - IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -430,8 +435,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -571,7 +577,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_hash(cp); expire_later: - IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1a0843cd58a9..3f47ad8e1cad 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -532,11 +532,8 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, { if (!((*pskb)->ipvs_property)) return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - (*okfn)(*pskb); - - return NF_STOLEN; + return NF_STOP; } u16 ip_vs_checksum_complete(struct 
sk_buff *skb, int offset) diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..7f0288b25fa1 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/capability.h> #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> @@ -35,6 +36,7 @@ #include <linux/netfilter_ipv4.h> #include <net/ip.h> +#include <net/route.h> #include <net/sock.h> #include <asm/uaccess.h> @@ -447,7 +449,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", fwmark, ip_vs_proto_name(protocol), NIPQUAD(vaddr), ntohs(vport), svc?"hit":"not hit"); @@ -597,7 +599,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "refcnt=%d\n", + "dest->refcnt=%d\n", dest->vfwmark, NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); @@ -804,7 +806,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) dest = ip_vs_trash_get_dest(svc, daddr, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, @@ -949,7 +951,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " + "dest->refcnt=%d\n", NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, 
&ip_vs_dest_trash); diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..c453e1e57f4b 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,8 +13,12 @@ * Changes: * */ +#include <linux/config.h> #include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/slab.h> #include <linux/types.h> +#include <linux/interrupt.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 561cda326fa8..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -228,33 +230,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. - * returns bool success. - */ -static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, - struct ip_vs_lblc_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblc_entry associated with supplied parameters. 
*/ diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index ce456dbf09a5..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> /* for sysctl */ #include <linux/fs.h> @@ -414,33 +416,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. - * returns bool success. - */ -static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, - struct ip_vs_lblcr_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblcr_entry associated with supplied parameters. 
*/ diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 0e878fd6215c..bc28b1160a3a 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = 2*HZ, }; - -#if 0 - -/* FIXME: This is going to die */ - -static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 10*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 100*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -#endif - static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", @@ -448,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_dest *dest = cp->dest; IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", pp->name, (state_off==TCP_DIR_OUTPUT)?"output ":"input ", th->syn? 
'S' : '.', diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include <linux/in.h> +#include <linux/ip.h> #include <linux/kernel.h> #include <linux/netfilter_ipv4.h> +#include <linux/udp.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 0f7c56a225bd..8bc42b76223d 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/spinlock.h> +#include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/skbuff.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include <linux/module.h> #include <linux/slab.h> +#include <linux/inetdevice.h> #include <linux/net.h> #include <linux/completion.h> #include <linux/delay.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/igmp.h> /* for ip_mc_join_group */ +#include <linux/udp.h> #include <net/ip.h> #include <net/sock.h> diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 3b87482049cf..52c12e9edbbc 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -322,7 +322,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = skb->nh.iph; u8 tos = old_iph->tos; - u16 df = old_iph->frag_off; + __be16 df = 
old_iph->frag_off; struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ int mtu; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index ae0779d82c5d..52a3d7c57907 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,17 +1,11 @@ /* IPv4 specific functions of netfilter core */ - -#include <linux/config.h> -#ifdef CONFIG_NETFILTER - #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> - -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <net/route.h> #include <linux/ip.h> +#include <net/route.h> +#include <net/xfrm.h> +#include <net/ip.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff **pskb) @@ -33,7 +27,6 @@ int ip_route_me_harder(struct sk_buff **pskb) #ifdef CONFIG_IP_ROUTE_FWMARK fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; #endif - fl.proto = iph->protocol; if (ip_route_output_key(&rt, &fl) != 0) return -1; @@ -60,6 +53,13 @@ int ip_route_me_harder(struct sk_buff **pskb) if ((*pskb)->dst->error) return -1; +#ifdef CONFIG_XFRM + if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) && + xfrm_decode_session(*pskb, &fl, AF_INET) == 0) + if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0)) + return -1; +#endif + /* Change in oif may mean change in hh_len. */ hh_len = (*pskb)->dst->dev->hard_header_len; if (skb_headroom(*pskb) < hh_len) { @@ -78,6 +78,9 @@ int ip_route_me_harder(struct sk_buff **pskb) } EXPORT_SYMBOL(ip_route_me_harder); +void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); +EXPORT_SYMBOL(ip_nat_decode_session); + /* * Extra routing may needed on local out, as the QUEUE target never * returns control to the table. 
@@ -135,5 +138,3 @@ static void fini(void) module_init(init); module_exit(fini); - -#endif /* CONFIG_NETFILTER */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 88a60650e6b8..db783036e4d8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -182,6 +182,7 @@ config IP_NF_QUEUE config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" + depends on NETFILTER_XTABLES help iptables is a general, extensible packet identification framework. The packet filtering and full NAT (masquerading, port forwarding, @@ -191,16 +192,6 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The matches. -config IP_NF_MATCH_LIMIT - tristate "limit match support" - depends on IP_NF_IPTABLES - help - limit matching allows you to control the rate at which a rule can be - matched: mainly useful in combination with the LOG target ("LOG - target support", below) and to avoid some Denial of Service attacks. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_IPRANGE tristate "IP range match support" depends on IP_NF_IPTABLES @@ -210,37 +201,6 @@ config IP_NF_MATCH_IPRANGE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_MAC - tristate "MAC address match support" - depends on IP_NF_IPTABLES - help - MAC matching allows you to match packets based on the source - Ethernet address of the packet. - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_MATCH_PKTTYPE - tristate "Packet type match support" - depends on IP_NF_IPTABLES - help - Packet type matching allows you to match a packet by - its "class", eg. BROADCAST, MULTICAST, ... - - Typical usage: - iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG - - To compile it as a module, choose M here. If unsure, say N. 
- -config IP_NF_MATCH_MARK - tristate "netfilter MARK match support" - depends on IP_NF_IPTABLES - help - Netfilter mark matching allows you to match packets based on the - `nfmark' value in the packet. This can be set by the MARK target - (see below). - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_MULTIPORT tristate "Multiple port match support" depends on IP_NF_IPTABLES @@ -301,15 +261,6 @@ config IP_NF_MATCH_AH_ESP To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_LENGTH - tristate "LENGTH match support" - depends on IP_NF_IPTABLES - help - This option allows you to match the length of a packet against a - specific value or range of values. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_TTL tristate "TTL match support" depends on IP_NF_IPTABLES @@ -319,50 +270,6 @@ config IP_NF_MATCH_TTL To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_TCPMSS - tristate "tcpmss match support" - depends on IP_NF_IPTABLES - help - This option adds a `tcpmss' match, which allows you to examine the - MSS value of TCP SYN packets, which control the maximum packet size - for that connection. - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_MATCH_HELPER - tristate "Helper match support" - depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - help - Helper matching allows you to match packets in dynamic connections - tracked by a conntrack-helper, ie. ip_conntrack_ftp - - To compile it as a module, choose M here. If unsure, say Y. - -config IP_NF_MATCH_STATE - tristate "Connection state match support" - depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - help - Connection state matching allows you to match packets based on their - relationship to a tracked connection (ie. previous packets). This - is a powerful tool for packet classification. 
- - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_MATCH_CONNTRACK - tristate "Connection tracking match support" - depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - help - This is a general conntrack match module, a superset of the state match. - - It allows matching on additional conntrack information, which is - useful in complex configurations, such as NAT gateways with multiple - internet links or tunnels. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_OWNER tristate "Owner match support" depends on IP_NF_IPTABLES @@ -372,15 +279,6 @@ config IP_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_PHYSDEV - tristate "Physdev match support" - depends on IP_NF_IPTABLES && BRIDGE_NETFILTER - help - Physdev packet matching matches against the physical bridge ports - the IP packet arrived on or will leave by. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_ADDRTYPE tristate 'address type match support' depends on IP_NF_IPTABLES @@ -391,75 +289,6 @@ config IP_NF_MATCH_ADDRTYPE If you want to compile it as a module, say M here and read <file:Documentation/modules.txt>. If unsure, say `N'. -config IP_NF_MATCH_REALM - tristate 'realm match support' - depends on IP_NF_IPTABLES - select NET_CLS_ROUTE - help - This option adds a `realm' match, which allows you to use the realm - key from the routing subsystem inside iptables. - - This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option - in tc world. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_MATCH_SCTP - tristate 'SCTP protocol match support' - depends on IP_NF_IPTABLES - help - With this option enabled, you will be able to use the iptables - `sctp' match in order to match on SCTP source/destination ports - and SCTP chunk types. 
- - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_MATCH_DCCP - tristate 'DCCP protocol match support' - depends on IP_NF_IPTABLES - help - With this option enabled, you will be able to use the iptables - `dccp' match in order to match on DCCP source/destination ports - and DCCP flags. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_MATCH_COMMENT - tristate 'comment match support' - depends on IP_NF_IPTABLES - help - This option adds a `comment' dummy-match, which allows you to put - comments in your iptables ruleset. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. - -config IP_NF_MATCH_CONNMARK - tristate 'Connection mark match support' - depends on IP_NF_IPTABLES - depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) - help - This option adds a `connmark' match, which allows you to match the - connection mark value previously set for the session by `CONNMARK'. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. The module will be called - ipt_connmark.o. If unsure, say `N'. - -config IP_NF_MATCH_CONNBYTES - tristate 'Connection byte/packet counter match support' - depends on IP_NF_IPTABLES - depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4) - help - This option adds a `connbytes' match, which allows you to match the - number of bytes and/or packets for each direction within a connection. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. 
- config IP_NF_MATCH_HASHLIMIT tristate 'hashlimit match support' depends on IP_NF_IPTABLES @@ -474,18 +303,15 @@ config IP_NF_MATCH_HASHLIMIT destination IP' or `500pps from any given source IP' with a single IPtables rule. -config IP_NF_MATCH_STRING - tristate 'string match support' - depends on IP_NF_IPTABLES - select TEXTSEARCH - select TEXTSEARCH_KMP - select TEXTSEARCH_BM - select TEXTSEARCH_FSM - help - This option adds a `string' match, which allows you to look for - pattern matchings in packets. +config IP_NF_MATCH_POLICY + tristate "IPsec policy match support" + depends on IP_NF_IPTABLES && XFRM + help + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. - To compile it as a module, choose M here. If unsure, say N. + To compile it as a module, choose M here. If unsure, say N. # `filter', generic and specific targets config IP_NF_FILTER @@ -562,17 +388,6 @@ config IP_NF_TARGET_TCPMSS To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_NFQUEUE - tristate "NFQUEUE Target Support" - depends on IP_NF_IPTABLES - help - This Target replaced the old obsolete QUEUE target. - - As opposed to QUEUE, it supports 65535 different queues, - not just one. - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets config IP_NF_NAT tristate "Full NAT" @@ -725,31 +540,6 @@ config IP_NF_TARGET_DSCP To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_MARK - tristate "MARK target support" - depends on IP_NF_MANGLE - help - This option adds a `MARK' target, which allows you to create rules - in the `mangle' table which alter the netfilter mark (nfmark) field - associated with the packet prior to routing. This can change - the routing method (see `Use netfilter MARK value as routing - key') and can also be used by other subsystems to change their - behavior. 
- - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_TARGET_CLASSIFY - tristate "CLASSIFY target support" - depends on IP_NF_MANGLE - help - This option adds a `CLASSIFY' target, which enables the user to set - the priority of a packet. Some qdiscs can use this value for - classification, among these are: - - atm, cbq, dsmark, pfifo_fast, htb, prio - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_TTL tristate 'TTL target support' depends on IP_NF_MANGLE @@ -764,19 +554,6 @@ config IP_NF_TARGET_TTL To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_CONNMARK - tristate 'CONNMARK target support' - depends on IP_NF_MANGLE - depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) - help - This option adds a `CONNMARK' target, which allows one to manipulate - the connection mark value. Similar to the MARK target, but - affects the connection mark value rather than the packet mark value. - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. The module will be called - ipt_CONNMARK.o. If unsure, say `N'. - config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" depends on IP_NF_MANGLE && EXPERIMENTAL @@ -800,23 +577,10 @@ config IP_NF_RAW If you want to compile it as a module, say M here and read <file:Documentation/modules.txt>. If unsure, say `N'. -config IP_NF_TARGET_NOTRACK - tristate 'NOTRACK target support' - depends on IP_NF_RAW - depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 - help - The NOTRACK target allows a select rule to specify - which packets *not* to enter the conntrack/NAT - subsystem with all the consequences (no ICMP error tracking, - no protocol helpers for the selected packets). - - If you want to compile it as a module, say M here and read - <file:Documentation/modules.txt>. If unsure, say `N'. 
- - # ARP tables config IP_NF_ARPTABLES tristate "ARP tables support" + depends on NETFILTER_XTABLES help arptables is a general, extensible packet identification framework. The ARP packet filtering and mangling (manipulation)subsystems diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index d0a447e520a2..bcefe64b9317 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -47,14 +47,8 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o -obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o -obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o -obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o -obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o -obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o -obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o @@ -62,39 +56,25 @@ obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o -obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o -obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o -obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o -obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o -obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o -obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o -obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o -obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o -obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o -obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o +obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o # targets obj-$(CONFIG_IP_NF_TARGET_REJECT) += 
ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o -obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o -obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o -obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o -obj-$(CONFIG_IP_NF_TARGET_NFQUEUE) += ipt_NFQUEUE.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e9639bba6..afe3d8f8177d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <linux/capability.h> #include <linux/if_arp.h> #include <linux/kmod.h> #include <linux/vmalloc.h> @@ -23,6 +24,7 @@ #include <asm/uaccess.h> #include <asm/semaphore.h> +#include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> MODULE_LICENSE("GPL"); @@ -54,33 +56,9 @@ do { \ #else #define ARP_NF_ASSERT(x) #endif -#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) -static DECLARE_MUTEX(arpt_mutex); - -#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) -#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) #include <linux/netfilter_ipv4/listhelp.h> -struct arpt_table_info { - unsigned int size; - unsigned int number; 
- unsigned int initial_entries; - unsigned int hook_entry[NF_ARP_NUMHOOKS]; - unsigned int underflow[NF_ARP_NUMHOOKS]; - char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); -}; - -static LIST_HEAD(arpt_target); -static LIST_HEAD(arpt_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) - -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, char *hdr_addr, int len) { @@ -227,9 +205,9 @@ static inline int arp_checkentry(const struct arpt_arp *arp) } static unsigned int arpt_error(struct sk_buff **pskb, - unsigned int hooknum, const struct net_device *in, const struct net_device *out, + unsigned int hooknum, const void *targinfo, void *userinfo) { @@ -258,6 +236,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, struct arpt_entry *e, *back; const char *indev, *outdev; void *table_base; + struct xt_table_info *private = table->private; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) + @@ -269,11 +248,9 @@ unsigned int arpt_do_table(struct sk_buff **pskb, outdev = out ? out->name : nulldevname; read_lock_bh(&table->lock); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, - smp_processor_id()); - e = get_entry(table_base, table->private->hook_entry[hook]); - back = get_entry(table_base, table->private->underflow[hook]); + table_base = (void *)private->entries[smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); + back = get_entry(table_base, private->underflow[hook]); arp = (*pskb)->nh.arph; do { @@ -321,8 +298,8 @@ unsigned int arpt_do_table(struct sk_buff **pskb, * abs. 
verdicts */ verdict = t->u.kernel.target->target(pskb, - hook, in, out, + hook, t->data, userdata); @@ -347,106 +324,6 @@ unsigned int arpt_do_table(struct sk_buff **pskb, return verdict; } -/* - * These are weird, but module loading must not be done with mutex - * held (since they will register), and we have to have a single - * function to use try_then_request_module(). - */ - -/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ -static inline struct arpt_table *find_table_lock(const char *name) -{ - struct arpt_table *t; - - if (down_interruptible(&arpt_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &arpt_tables, list) - if (strcmp(t->name, name) == 0 && try_module_get(t->me)) - return t; - up(&arpt_mutex); - return NULL; -} - - -/* Find target, grabs ref. Returns ERR_PTR() on error. */ -static inline struct arpt_target *find_target(const char *name, u8 revision) -{ - struct arpt_target *t; - int err = 0; - - if (down_interruptible(&arpt_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &arpt_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision == revision) { - if (try_module_get(t->me)) { - up(&arpt_mutex); - return t; - } - } else - err = -EPROTOTYPE; /* Found something. 
*/ - } - } - up(&arpt_mutex); - return ERR_PTR(err); -} - -struct arpt_target *arpt_find_target(const char *name, u8 revision) -{ - struct arpt_target *target; - - target = try_then_request_module(find_target(name, revision), - "arpt_%s", name); - if (IS_ERR(target) || !target) - return NULL; - return target; -} - -static int target_revfn(const char *name, u8 revision, int *bestp) -{ - struct arpt_target *t; - int have_rev = 0; - - list_for_each_entry(t, &arpt_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision > *bestp) - *bestp = t->revision; - if (t->revision == revision) - have_rev =1; - } - } - return have_rev; -} - -/* Returns true or false (if no such extension at all) */ -static inline int find_revision(const char *name, u8 revision, - int (*revfn)(const char *, u8, int *), - int *err) -{ - int have_rev, best = -1; - - if (down_interruptible(&arpt_mutex) != 0) { - *err = -EINTR; - return 1; - } - have_rev = revfn(name, revision, &best); - up(&arpt_mutex); - - /* Nothing at all? Return 0 to try loading module. */ - if (best == -1) { - *err = -ENOENT; - return 0; - } - - *err = best; - if (!have_rev) - *err = -EPROTONOSUPPORT; - return 1; -} - - /* All zeroes == unconditional rule. */ static inline int unconditional(const struct arpt_arp *arp) { @@ -462,7 +339,8 @@ static inline int unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. 
*/ -static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +static int mark_source_chains(struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -472,7 +350,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e - = (struct arpt_entry *)(newinfo->entries + pos); + = (struct arpt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -514,13 +392,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali goto next; e = (struct arpt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct arpt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -537,7 +415,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali newpos = pos + e->next_offset; } e = (struct arpt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -592,8 +470,8 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i } t = arpt_get_target(e); - target = try_then_request_module(find_target(t->u.user.name, - t->u.user.revision), + target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, + t->u.user.revision), "arpt_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("check_entry: `%s' not found\n", t->u.user.name); @@ -627,7 +505,7 @@ out: } static inline int check_entry_size_and_hooks(struct arpt_entry *e, - struct arpt_table_info *newinfo, + struct xt_table_info *newinfo, unsigned char *base, unsigned char *limit, const unsigned int *hook_entries, @@ -661,7 +539,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, < 0 (not 
ARPT_RETURN). --RR */ /* Clear counters and comefrom */ - e->counters = ((struct arpt_counters) { 0, 0 }); + e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; (*i)++; @@ -688,7 +566,8 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) */ static int translate_table(const char *name, unsigned int valid_hooks, - struct arpt_table_info *newinfo, + struct xt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -710,11 +589,11 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -743,79 +622,78 @@ static int translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) { + if (!mark_source_chains(newinfo, valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; } -static struct arpt_table_info *replace_table(struct arpt_table *table, - unsigned int num_counters, - struct arpt_table_info *newinfo, - int *error) +/* Gets 
counters. */ +static inline int add_entry_to_counter(const struct arpt_entry *e, + struct xt_counters total[], + unsigned int *i) { - struct arpt_table_info *oldinfo; - - /* Do the substitution. */ - write_lock_bh(&table->lock); - /* Check inside lock: is the old number correct? */ - if (num_counters != table->private->number) { - duprintf("num_counters != table->private->number (%u/%u)\n", - num_counters, table->private->number); - write_unlock_bh(&table->lock); - *error = -EAGAIN; - return NULL; - } - oldinfo = table->private; - table->private = newinfo; - newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - return oldinfo; + (*i)++; + return 0; } -/* Gets counters. */ -static inline int add_entry_to_counter(const struct arpt_entry *e, - struct arpt_counters total[], +static inline int set_entry_to_counter(const struct arpt_entry *e, + struct xt_counters total[], unsigned int *i) { - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); (*i)++; return 0; } -static void get_counters(const struct arpt_table_info *t, - struct arpt_counters counters[]) +static void get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. 
+ */ + curcpu = raw_smp_processor_id(); + + i = 0; + ARPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -829,27 +707,29 @@ static int copy_entries_to_user(unsigned int total_size, { unsigned int off, num, countersize; struct arpt_entry *e; - struct arpt_counters *counters; + struct xt_counters *counters; + struct xt_table_info *private = table->private; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care * about). */ - countersize = sizeof(struct arpt_counters) * table->private->number; - counters = vmalloc(countersize); + countersize = sizeof(struct xt_counters) * private->number; + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); - get_counters(table->private, counters); + get_counters(private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... 
*/ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -859,7 +739,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ struct arpt_entry_target *t; - e = (struct arpt_entry *)(table->private->entries + off); + e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -890,21 +770,21 @@ static int get_entries(const struct arpt_get_entries *entries, int ret; struct arpt_table *t; - t = find_table_lock(entries->name); + t = xt_find_table_lock(NF_ARP, entries->name); if (t || !IS_ERR(t)) { + struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", - t->private->number); - if (entries->size == t->private->size) - ret = copy_entries_to_user(t->private->size, + private->number); + if (entries->size == private->size) + ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - t->private->size, - entries->size); + private->size, entries->size); ret = -EINVAL; } module_put(t->me); - up(&arpt_mutex); + xt_table_unlock(t); } else ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -916,8 +796,9 @@ static int do_replace(void __user *user, unsigned int len) int ret; struct arpt_replace tmp; struct arpt_table *t; - struct arpt_table_info *newinfo, *oldinfo; - struct arpt_counters *counters; + struct xt_table_info *newinfo, *oldinfo; + struct xt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -926,38 +807,33 @@ static int do_replace(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.size) return -ENOPROTOOPT; - /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) - return -ENOMEM; - - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } - counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters)); + counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; duprintf("arp_tables: Translated table\n"); - t = try_then_request_module(find_table_lock(tmp.name), + t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name), "arptable_%s", tmp.name); if (!t || IS_ERR(t)) { ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -972,7 +848,7 @@ static int do_replace(void __user *user, unsigned int len) goto put_module; } - oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -989,24 +865,26 @@ static int do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + + xt_free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, - sizeof(struct arpt_counters) * tmp.num_counters) != 0) + sizeof(struct xt_counters) * tmp.num_counters) != 0) ret = -EFAULT; vfree(counters); - up(&arpt_mutex); + xt_table_unlock(t); return ret; put_module: module_put(t->me); - up(&arpt_mutex); + xt_table_unlock(t); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } @@ -1014,7 +892,7 @@ static int do_replace(void __user *user, unsigned int len) * and everything is OK. 
*/ static inline int add_counter_to_entry(struct arpt_entry *e, - const struct arpt_counters addme[], + const struct xt_counters addme[], unsigned int *i) { @@ -1027,14 +905,16 @@ static inline int add_counter_to_entry(struct arpt_entry *e, static int do_add_counters(void __user *user, unsigned int len) { unsigned int i; - struct arpt_counters_info tmp, *paddc; + struct xt_counters_info tmp, *paddc; struct arpt_table *t; + struct xt_table_info *private; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters)) + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) return -EINVAL; paddc = vmalloc(len); @@ -1046,27 +926,30 @@ static int do_add_counters(void __user *user, unsigned int len) goto free; } - t = find_table_lock(tmp.name); + t = xt_find_table_lock(NF_ARP, tmp.name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } write_lock_bh(&t->lock); - if (t->private->number != paddc->num_counters) { + private = t->private; + if (private->number != paddc->num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; - ARPT_ENTRY_ITERATE(t->private->entries, - t->private->size, + /* Choose the copy that is on our node */ + loc_cpu_entry = private->entries[smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, + private->size, add_counter_to_entry, paddc->counters, &i); unlock_up_free: write_unlock_bh(&t->lock); - up(&arpt_mutex); + xt_table_unlock(t); module_put(t->me); free: vfree(paddc); @@ -1123,25 +1006,26 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; - t = try_then_request_module(find_table_lock(name), + t = try_then_request_module(xt_find_table_lock(NF_ARP, name), "arptable_%s", name); if (t && !IS_ERR(t)) { struct arpt_getinfo info; + struct xt_table_info *private = t->private; info.valid_hooks = t->valid_hooks; - 
memcpy(info.hook_entry, t->private->hook_entry, + memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); - memcpy(info.underflow, t->private->underflow, + memcpy(info.underflow, private->underflow, sizeof(info.underflow)); - info.num_entries = t->private->number; - info.size = t->private->size; + info.num_entries = private->number; + info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; - up(&arpt_mutex); + xt_table_unlock(t); module_put(t->me); } else ret = t ? PTR_ERR(t) : -ENOENT; @@ -1166,7 +1050,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } case ARPT_SO_GET_REVISION_TARGET: { - struct arpt_get_revision rev; + struct xt_get_revision rev; if (*len != sizeof(rev)) { ret = -EINVAL; @@ -1177,8 +1061,8 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; } - try_then_request_module(find_revision(rev.name, rev.revision, - target_revfn, &ret), + try_then_request_module(xt_find_revision(NF_ARP, rev.name, + rev.revision, 1, &ret), "arpt_%s", rev.name); break; } @@ -1191,101 +1075,57 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -/* Registration hooks for targets. 
*/ -int arpt_register_target(struct arpt_target *target) -{ - int ret; - - ret = down_interruptible(&arpt_mutex); - if (ret != 0) - return ret; - - list_add(&target->list, &arpt_target); - up(&arpt_mutex); - - return ret; -} - -void arpt_unregister_target(struct arpt_target *target) -{ - down(&arpt_mutex); - LIST_DELETE(&arpt_target, target); - up(&arpt_mutex); -} - int arpt_register_table(struct arpt_table *table, const struct arpt_replace *repl) { int ret; - struct arpt_table_info *newinfo; - static struct arpt_table_info bootstrap + struct xt_table_info *newinfo; + static struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; return ret; } - memcpy(newinfo->entries, repl->entries, repl->size); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); + duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) { - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } - ret = down_interruptible(&arpt_mutex); - if (ret != 0) { - vfree(newinfo); + if (xt_register_table(table, &bootstrap, newinfo) != 0) { + xt_free_table_info(newinfo); return ret; } - /* Don't autoload: we'd eat our tail... */ - if (list_named_find(&arpt_tables, table->name)) { - ret = -EEXIST; - goto free_unlock; - } - - /* Simplifies replace_table code. 
*/ - table->private = &bootstrap; - if (!replace_table(table, 0, newinfo, &ret)) - goto free_unlock; - - duprintf("table->private->number = %u\n", - table->private->number); - - /* save number of initial entries */ - table->private->initial_entries = table->private->number; - - rwlock_init(&table->lock); - list_prepend(&arpt_tables, table); - - unlock: - up(&arpt_mutex); - return ret; - - free_unlock: - vfree(newinfo); - goto unlock; + return 0; } void arpt_unregister_table(struct arpt_table *table) { - down(&arpt_mutex); - LIST_DELETE(&arpt_tables, table); - up(&arpt_mutex); + struct xt_table_info *private; + void *loc_cpu_entry; + + private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ - ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); - vfree(table->private); + xt_free_table_info(private); } /* The built-in targets: standard (NULL) and error. 
*/ @@ -1308,52 +1148,15 @@ static struct nf_sockopt_ops arpt_sockopts = { .get = do_arpt_get_ctl, }; -#ifdef CONFIG_PROC_FS -static inline int print_name(const struct arpt_table *t, - off_t start_offset, char *buffer, int length, - off_t *pos, unsigned int *count) -{ - if ((*count)++ >= start_offset) { - unsigned int namelen; - - namelen = sprintf(buffer + *pos, "%s\n", t->name); - if (*pos + namelen > length) { - /* Stop iterating */ - return 1; - } - *pos += namelen; - } - return 0; -} - -static int arpt_get_tables(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&arpt_mutex) != 0) - return 0; - - LIST_FIND(&arpt_tables, print_name, struct arpt_table *, - offset, buffer, length, &pos, &count); - - up(&arpt_mutex); - - /* `start' hack - see fs/proc/generic.c line ~105 */ - *start=(char *)((unsigned long)count-offset); - return pos; -} -#endif /*CONFIG_PROC_FS*/ - static int __init init(void) { int ret; + xt_proto_init(NF_ARP); + /* Noone else will be downing sem now, so we won't sleep */ - down(&arpt_mutex); - list_append(&arpt_target, &arpt_standard_target); - list_append(&arpt_target, &arpt_error_target); - up(&arpt_mutex); + xt_register_target(NF_ARP, &arpt_standard_target); + xt_register_target(NF_ARP, &arpt_error_target); /* Register setsockopt */ ret = nf_register_sockopt(&arpt_sockopts); @@ -1362,19 +1165,6 @@ static int __init init(void) return ret; } -#ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *proc; - - proc = proc_net_create("arp_tables_names", 0, arpt_get_tables); - if (!proc) { - nf_unregister_sockopt(&arpt_sockopts); - return -ENOMEM; - } - proc->owner = THIS_MODULE; - } -#endif - printk("arp_tables: (C) 2002 David S. 
Miller\n"); return 0; } @@ -1382,16 +1172,12 @@ static int __init init(void) static void __exit fini(void) { nf_unregister_sockopt(&arpt_sockopts); -#ifdef CONFIG_PROC_FS - proc_net_remove("arp_tables_names"); -#endif + xt_proto_fini(NF_ARP); } EXPORT_SYMBOL(arpt_register_table); EXPORT_SYMBOL(arpt_unregister_table); EXPORT_SYMBOL(arpt_do_table); -EXPORT_SYMBOL(arpt_register_target); -EXPORT_SYMBOL(arpt_unregister_target); module_init(init); module_exit(fini); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 3e592ec86482..c97650a16a5b 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -8,8 +8,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("arptables arp payload mangle target"); static unsigned int -target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, - const struct net_device *out, const void *targinfo, void *userinfo) +target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, const void *targinfo, + void *userinfo) { const struct arpt_mangle *mangle = targinfo; struct arphdr *arp; @@ -64,7 +65,7 @@ target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, } static int -checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo, +checkentry(const char *tablename, const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { const struct arpt_mangle *mangle = targinfo; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 0d759f5a4ef0..f6ab45f48681 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -145,6 +145,7 @@ static struct arpt_table packet_filter = { .lock = RW_LOCK_UNLOCKED, .private = NULL, .me = THIS_MODULE, + .af = NF_ARP, }; /* The work comes in here from netfilter.c */ diff --git 
a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..84e4f79b7ffa 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/ip.h> #include <linux/moduleparam.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> @@ -34,7 +36,7 @@ static unsigned int master_timeout = 300; MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); -module_param(master_timeout, int, 0600); +module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 68b173bcda60..e627e5856172 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -34,7 +34,7 @@ static int ports_c; module_param_array(ports, ushort, &ports_c, 0400); static int loose; -module_param(loose, int, 0600); +module_param(loose, bool, 0600); unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 4108a5e12b3c..d716bba798f2 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -762,7 +762,7 @@ static struct ip_conntrack_helper pptp = { .help = conntrack_pptp_help }; -extern void __exit ip_ct_proto_gre_fini(void); +extern void ip_ct_proto_gre_fini(void); extern int __init ip_ct_proto_gre_init(void); /* ip_conntrack_pptp initialization */ diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 
d7c40421d0d1..c51a2cf71b4b 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -36,7 +36,7 @@ #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; static int ports_c; -static int max_dcc_channels = 8; +static unsigned int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; /* This is slow, but it's simple. --RR */ static char *irc_buffer; @@ -54,9 +54,9 @@ MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); -module_param(max_dcc_channels, int, 0400); +module_param(max_dcc_channels, uint, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); -module_param(dcc_timeout, int, 0400); +module_param(dcc_timeout, uint, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; @@ -254,10 +254,6 @@ static int __init init(void) printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); return -EBUSY; } - if (dcc_timeout < 0) { - printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); - return -EBUSY; - } irc_buffer = kmalloc(65536, GFP_KERNEL); if (!irc_buffer) diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c index 186646eb249f..4e68e16a2612 100644 --- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c +++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c @@ -37,7 +37,7 @@ MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); MODULE_LICENSE("GPL"); static unsigned int timeout = 3; -module_param(timeout, int, 0600); +module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); static int help(struct sk_buff **pskb, diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c index 91fe8f2e38ff..c9ebbe0d2d9c 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -79,6 +79,7 @@ ctnetlink_dump_tuples(struct sk_buff *skb, const struct ip_conntrack_tuple *tuple) { struct nfattr *nest_parms; + int ret; nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); @@ -86,10 +87,10 @@ ctnetlink_dump_tuples(struct sk_buff *skb, NFA_NEST_END(skb, nest_parms); nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); - ctnetlink_dump_tuples_proto(skb, tuple); + ret = ctnetlink_dump_tuples_proto(skb, tuple); NFA_NEST_END(skb, nest_parms); - return 0; + return ret; nfattr_failure: return -1; @@ -160,7 +161,7 @@ ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) return 0; nest_helper = NFA_NEST(skb, CTA_HELP); - NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); + NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name); if (ct->helper->to_nfattr) ct->helper->to_nfattr(skb, ct); @@ -229,7 +230,7 @@ nfattr_failure: static inline int ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) { - unsigned int use = htonl(atomic_read(&ct->ct_general.use)); + u_int32_t use = htonl(atomic_read(&ct->ct_general.use)); NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); return 0; @@ -311,29 +312,22 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, if (events & IPCT_DESTROY) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; - goto alloc_skb; - } - if (events & (IPCT_NEW | IPCT_RELATED)) { + } else if (events & (IPCT_NEW | IPCT_RELATED)) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; /* dump everything */ events = ~0UL; group = NFNLGRP_CONNTRACK_NEW; - goto alloc_skb; - } - if (events & (IPCT_STATUS | + } else if (events & (IPCT_STATUS | IPCT_PROTOINFO | IPCT_HELPER | IPCT_HELPINFO | IPCT_NATINFO)) { type = 
IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; - goto alloc_skb; - } + } else + return NOTIFY_DONE; - return NOTIFY_DONE; - -alloc_skb: /* FIXME: Check if there are any listeners before, don't hurt performance */ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -1037,6 +1031,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], return err; } +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + ct->helper = ip_conntrack_helper_find_get(rtuple); add_timer(&ct->timeout); @@ -1045,11 +1044,6 @@ ctnetlink_create_conntrack(struct nfattr *cda[], if (ct->helper) ip_conntrack_helper_put(ct->helper); -#if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (cda[CTA_MARK-1]) - ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); -#endif - DEBUGP("conntrack with id %u inserted\n", ct->id); return 0; @@ -1209,7 +1203,6 @@ static int ctnetlink_expect_event(struct notifier_block *this, unsigned int type; unsigned char *b; int flags = 0; - u16 proto; if (events & IPEXP_NEW) { type = IPCTNL_MSG_EXP_NEW; @@ -1236,7 +1229,6 @@ static int ctnetlink_expect_event(struct notifier_block *this, goto nfattr_failure; nlh->nlmsg_len = skb->tail - b; - proto = exp->tuple.dst.protonum; nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); return NOTIFY_DONE; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c index 88c3712bd251..f891308b5e4c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_generic.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -12,7 +12,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -unsigned long ip_ct_generic_timeout = 600*HZ; +unsigned int ip_ct_generic_timeout = 600*HZ; static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 
744abb9d377a..c777abf16cb7 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include <linux/ip.h> #include <linux/in.h> #include <linux/list.h> +#include <linux/seq_file.h> static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) @@ -308,7 +309,10 @@ int __init ip_ct_proto_gre_init(void) return ip_conntrack_protocol_register(&gre); } -void __exit ip_ct_proto_gre_fini(void) +/* This cannot be __exit, as it is invoked from ip_conntrack_helper_pptp.c's + * init() code on errors. + */ +void ip_ct_proto_gre_fini(void) { struct list_head *pos, *n; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 5f9925db608e..3021af0910f1 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -16,13 +16,12 @@ #include <linux/skbuff.h> #include <net/ip.h> #include <net/checksum.h> -#include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -unsigned long ip_ct_icmp_timeout = 30*HZ; +unsigned int ip_ct_icmp_timeout = 30*HZ; #if 0 #define DEBUGP printk @@ -47,20 +46,21 @@ static int icmp_pkt_to_tuple(const struct sk_buff *skb, return 1; } +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig) { - /* Add 1; spaces filled with 0. 
*/ - static const u_int8_t invmap[] - = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; - if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return 0; @@ -110,17 +110,17 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } -static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 -}; - /* Called when a new connection for this protocol found. */ static int icmp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. 
*/ @@ -279,10 +279,6 @@ static int icmp_tuple_to_nfattr(struct sk_buff *skb, NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), &t->dst.u.icmp.code); - if (t->dst.u.icmp.type >= sizeof(valid_new) - || !valid_new[t->dst.u.icmp.type]) - return -EINVAL; - return 0; nfattr_failure: @@ -295,7 +291,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], if (!tb[CTA_PROTO_ICMP_TYPE-1] || !tb[CTA_PROTO_ICMP_CODE-1] || !tb[CTA_PROTO_ICMP_ID-1]) - return -1; + return -EINVAL; tuple->dst.u.icmp.type = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); @@ -304,6 +300,10 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], tuple->src.u.icmp.id = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) + || !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + return 0; } #endif diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 977fb59d4563..be602e8aeab0 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -16,6 +16,7 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/interrupt.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/in.h> @@ -57,15 +58,15 @@ static const char *sctp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned long ip_ct_sctp_timeout_closed = 10 SECS; -static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS; -static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS; -static unsigned long ip_ct_sctp_timeout_established = 5 DAYS; -static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; -static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; -static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; +static unsigned int ip_ct_sctp_timeout_closed = 10 SECS; +static unsigned int ip_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned int 
ip_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned int ip_ct_sctp_timeout_established = 5 DAYS; +static unsigned int ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned int ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned int ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; -static const unsigned long * sctp_timeouts[] +static const unsigned int * sctp_timeouts[] = { NULL, /* SCTP_CONNTRACK_NONE */ &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index e7fa29e576dc..e0dc37063545 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -32,7 +32,6 @@ #include <net/tcp.h> -#include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> @@ -85,21 +84,21 @@ static const char *tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS; -unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS; -unsigned long ip_ct_tcp_timeout_established = 5 DAYS; -unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS; -unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS; -unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; -unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; -unsigned long ip_ct_tcp_timeout_close = 10 SECS; +unsigned int ip_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned int ip_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned int ip_ct_tcp_timeout_established = 5 DAYS; +unsigned int ip_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned int ip_ct_tcp_timeout_close_wait = 60 SECS; +unsigned int ip_ct_tcp_timeout_last_ack = 30 SECS; +unsigned int ip_ct_tcp_timeout_time_wait = 2 MINS; +unsigned int ip_ct_tcp_timeout_close = 10 SECS; /* RFC1122 says the R2 limit 
should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ -unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; +unsigned int ip_ct_tcp_timeout_max_retrans = 5 MINS; -static const unsigned long * tcp_timeouts[] +static const unsigned int * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -995,7 +994,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN or ACK we had let trough + /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then @@ -1006,7 +1005,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, * segments we ignored. */ goto in_window; } - /* Just fall trough */ + /* Just fall through */ default: /* Keep compilers happy. 
*/ break; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7c7660..55b7d3210adf 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,15 +11,15 @@ #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/udp.h> #include <linux/seq_file.h> #include <net/checksum.h> -#include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> -unsigned long ip_ct_udp_timeout = 30*HZ; -unsigned long ip_ct_udp_timeout_stream = 180*HZ; +unsigned int ip_ct_udp_timeout = 30*HZ; +unsigned int ip_ct_udp_timeout_stream = 180*HZ; static int udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..833fcb4be5e7 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include <net/checksum.h> #include <net/ip.h> +#include <net/route.h> #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) @@ -450,30 +451,6 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum, return NF_ACCEPT; } -static unsigned int ip_refrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct rtable *rt = (struct rtable *)(*pskb)->dst; - - /* We've seen it coming out the other side: confirm */ - if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) - return NF_DROP; - - /* Local packets are never produced too large for their - interface. We degfragment them at LOCAL_OUT, however, - so we have to refragment them here. */ - if ((*pskb)->len > dst_mtu(&rt->u.dst) && - !skb_shinfo(*pskb)->tso_size) { - /* No hook can be after us, so this should be OK. 
*/ - ip_fragment(*pskb, okfn); - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ip_conntrack_local(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, @@ -543,7 +520,7 @@ static struct nf_hook_ops ip_conntrack_helper_in_ops = { /* Refragmenter; last chance. */ static struct nf_hook_ops ip_conntrack_out_ops = { - .hook = ip_refrag, + .hook = ip_confirm, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, @@ -567,28 +544,28 @@ extern int ip_conntrack_max; extern unsigned int ip_conntrack_htable_size; /* From ip_conntrack_proto_tcp.c */ -extern unsigned long ip_ct_tcp_timeout_syn_sent; -extern unsigned long ip_ct_tcp_timeout_syn_recv; -extern unsigned long ip_ct_tcp_timeout_established; -extern unsigned long ip_ct_tcp_timeout_fin_wait; -extern unsigned long ip_ct_tcp_timeout_close_wait; -extern unsigned long ip_ct_tcp_timeout_last_ack; -extern unsigned long ip_ct_tcp_timeout_time_wait; -extern unsigned long ip_ct_tcp_timeout_close; -extern unsigned long ip_ct_tcp_timeout_max_retrans; +extern unsigned int ip_ct_tcp_timeout_syn_sent; +extern unsigned int ip_ct_tcp_timeout_syn_recv; +extern unsigned int ip_ct_tcp_timeout_established; +extern unsigned int ip_ct_tcp_timeout_fin_wait; +extern unsigned int ip_ct_tcp_timeout_close_wait; +extern unsigned int ip_ct_tcp_timeout_last_ack; +extern unsigned int ip_ct_tcp_timeout_time_wait; +extern unsigned int ip_ct_tcp_timeout_close; +extern unsigned int ip_ct_tcp_timeout_max_retrans; extern int ip_ct_tcp_loose; extern int ip_ct_tcp_be_liberal; extern int ip_ct_tcp_max_retrans; /* From ip_conntrack_proto_udp.c */ -extern unsigned long ip_ct_udp_timeout; -extern unsigned long ip_ct_udp_timeout_stream; +extern unsigned int ip_ct_udp_timeout; +extern unsigned int ip_ct_udp_timeout_stream; /* From ip_conntrack_proto_icmp.c */ -extern unsigned long ip_ct_icmp_timeout; +extern unsigned int ip_ct_icmp_timeout; /* From ip_conntrack_proto_icmp.c */ -extern unsigned long 
ip_ct_generic_timeout; +extern unsigned int ip_ct_generic_timeout; /* Log invalid packets of a given protocol */ static int log_invalid_proto_min = 0; @@ -967,7 +944,7 @@ module_exit(fini); /* Some modules need us, but don't depend directly on any symbol. They should call this. */ -void need_ip_conntrack(void) +void need_conntrack(void) { } @@ -985,7 +962,7 @@ EXPORT_SYMBOL(ip_ct_get_tuple); EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); EXPORT_SYMBOL(ip_conntrack_destroyed); -EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(need_conntrack); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index d83757a70d9f..b8daab3c64af 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -171,7 +171,7 @@ static int __init init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. 
*/ static int warn_set(const char *val, struct kernel_param *kp) { - printk(KERN_INFO __stringify(KBUILD_MODNAME) + printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index e546203f5662..ac004895781a 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -148,14 +148,14 @@ pptp_outbound_pkt(struct sk_buff **pskb, { struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - - u_int16_t msg, *cid = NULL, new_callid; + u_int16_t msg, new_callid; + unsigned int cid_off; new_callid = htons(ct_pptp_info->pns_call_id); switch (msg = ntohs(ctlh->messageType)) { case PPTP_OUT_CALL_REQUEST: - cid = &pptpReq->ocreq.callID; + cid_off = offsetof(union pptp_ctrl_union, ocreq.callID); /* FIXME: ideally we would want to reserve a call ID * here. current netfilter NAT core is not able to do * this :( For now we use TCP source port. 
This breaks @@ -172,10 +172,10 @@ pptp_outbound_pkt(struct sk_buff **pskb, ct_pptp_info->pns_call_id = ntohs(new_callid); break; case PPTP_IN_CALL_REPLY: - cid = &pptpReq->icreq.callID; + cid_off = offsetof(union pptp_ctrl_union, icreq.callID); break; case PPTP_CALL_CLEAR_REQUEST: - cid = &pptpReq->clrreq.callID; + cid_off = offsetof(union pptp_ctrl_union, clrreq.callID); break; default: DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, @@ -197,18 +197,15 @@ pptp_outbound_pkt(struct sk_buff **pskb, /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass * down to here */ - - IP_NF_ASSERT(cid); - DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_callid)); + ntohs(*(u_int16_t *)pptpReq + cid_off), ntohs(new_callid)); /* mangle packet */ if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - (void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)), - sizeof(new_callid), - (char *)&new_callid, - sizeof(new_callid)) == 0) + cid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_callid), (char *)&new_callid, + sizeof(new_callid)) == 0) return NF_DROP; return NF_ACCEPT; @@ -299,31 +296,30 @@ pptp_inbound_pkt(struct sk_buff **pskb, union pptp_ctrl_union *pptpReq) { struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info; - u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL; - - int ret = NF_ACCEPT, rv; + u_int16_t msg, new_cid = 0, new_pcid; + unsigned int pcid_off, cid_off = 0; new_pcid = htons(nat_pptp_info->pns_call_id); switch (msg = ntohs(ctlh->messageType)) { case PPTP_OUT_CALL_REPLY: - pcid = &pptpReq->ocack.peersCallID; - cid = &pptpReq->ocack.callID; + pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID); + cid_off = offsetof(union pptp_ctrl_union, ocack.callID); break; case PPTP_IN_CALL_CONNECT: - pcid = &pptpReq->iccon.peersCallID; + pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID); break; case PPTP_IN_CALL_REQUEST: /* only need to nat in case PAC is 
behind NAT box */ - break; + return NF_ACCEPT; case PPTP_WAN_ERROR_NOTIFY: - pcid = &pptpReq->wanerr.peersCallID; + pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID); break; case PPTP_CALL_DISCONNECT_NOTIFY: - pcid = &pptpReq->disc.callID; + pcid_off = offsetof(union pptp_ctrl_union, disc.callID); break; case PPTP_SET_LINK_INFO: - pcid = &pptpReq->setlink.peersCallID; + pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID); break; default: @@ -345,35 +341,26 @@ pptp_inbound_pkt(struct sk_buff **pskb, * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */ /* mangle packet */ - IP_NF_ASSERT(pcid); DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", - ntohs(*pcid), ntohs(new_pcid)); - - rv = ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - (void *)pcid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)), - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)); - if (rv != NF_ACCEPT) - return rv; + ntohs(*(u_int16_t *)pptpReq + pcid_off), ntohs(new_pcid)); + + if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, + pcid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_pcid), (char *)&new_pcid, + sizeof(new_pcid)) == 0) + return NF_DROP; if (new_cid) { - IP_NF_ASSERT(cid); DEBUGP("altering call id from 0x%04x to 0x%04x\n", - ntohs(*cid), ntohs(new_cid)); - rv = ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, - (void *)cid - ((void *)ctlh - sizeof(struct pptp_pkt_hdr)), - sizeof(new_cid), - (char *)&new_cid, - sizeof(new_cid)); - if (rv != NF_ACCEPT) - return rv; + ntohs(*(u_int16_t *)pptpReq + cid_off), ntohs(new_cid)); + if (ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, + cid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_cid), (char *)&new_cid, + sizeof(new_cid)) == 0) + return NF_DROP; } - - /* check for earlier return value of 'switch' above */ - if (ret != NF_ACCEPT) - return ret; - - /* great, at least we don't need to resize packets */ return NF_ACCEPT; } diff --git 
a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c index de31942babe3..461c833eaca1 100644 --- a/net/ipv4/netfilter/ip_nat_irc.c +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -113,7 +113,7 @@ static int __init init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ static int warn_set(const char *val, struct kernel_param *kp) { - printk(KERN_INFO __stringify(KBUILD_MODNAME) + printk(KERN_INFO KBUILD_MODNAME ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index f7cad7cf1aec..6c4899d8046a 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -151,42 +151,6 @@ gre_manip_pkt(struct sk_buff **pskb, return 1; } -/* print out a nat tuple */ -static unsigned int -gre_print(char *buffer, - const struct ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - unsigned int len = 0; - - if (mask->src.u.gre.key) - len += sprintf(buffer + len, "srckey=0x%x ", - ntohl(match->src.u.gre.key)); - - if (mask->dst.u.gre.key) - len += sprintf(buffer + len, "dstkey=0x%x ", - ntohl(match->src.u.gre.key)); - - return len; -} - -/* print a range of keys */ -static unsigned int -gre_print_range(char *buffer, const struct ip_nat_range *range) -{ - if (range->min.gre.key != 0 - || range->max.gre.key != 0xFFFF) { - if (range->min.gre.key == range->max.gre.key) - return sprintf(buffer, "key 0x%x ", - ntohl(range->min.gre.key)); - else - return sprintf(buffer, "keys 0x%u-0x%u ", - ntohl(range->min.gre.key), - ntohl(range->max.gre.key)); - } else - return 0; -} - /* nat helper struct */ static struct ip_nat_protocol gre = { .name = "GRE", @@ -194,8 +158,6 @@ static struct ip_nat_protocol gre = { .manip_pkt = gre_manip_pkt, .in_range = gre_in_range, .unique_tuple = gre_unique_tuple, - .print = gre_print, - .print_range = gre_print_range, #if 
defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) .range_to_nfattr = ip_nat_port_range_to_nfattr, diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 938719043999..31a3f4ccb99c 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -74,38 +74,6 @@ icmp_manip_pkt(struct sk_buff **pskb, return 1; } -static unsigned int -icmp_print(char *buffer, - const struct ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - unsigned int len = 0; - - if (mask->src.u.icmp.id) - len += sprintf(buffer + len, "id=%u ", - ntohs(match->src.u.icmp.id)); - - if (mask->dst.u.icmp.type) - len += sprintf(buffer + len, "type=%u ", - ntohs(match->dst.u.icmp.type)); - - if (mask->dst.u.icmp.code) - len += sprintf(buffer + len, "code=%u ", - ntohs(match->dst.u.icmp.code)); - - return len; -} - -static unsigned int -icmp_print_range(char *buffer, const struct ip_nat_range *range) -{ - if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF) - return sprintf(buffer, "id %u-%u ", - ntohs(range->min.icmp.id), - ntohs(range->max.icmp.id)); - else return 0; -} - struct ip_nat_protocol ip_nat_protocol_icmp = { .name = "ICMP", .protonum = IPPROTO_ICMP, @@ -113,8 +81,6 @@ struct ip_nat_protocol ip_nat_protocol_icmp = { .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, - .print = icmp_print, - .print_range = icmp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) .range_to_nfattr = ip_nat_port_range_to_nfattr, diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index 1d381bf68574..a3d14079eba6 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -136,40 +136,6 @@ tcp_manip_pkt(struct sk_buff **pskb, return 1; } -static unsigned int -tcp_print(char *buffer, - const struct 
ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - unsigned int len = 0; - - if (mask->src.u.tcp.port) - len += sprintf(buffer + len, "srcpt=%u ", - ntohs(match->src.u.tcp.port)); - - - if (mask->dst.u.tcp.port) - len += sprintf(buffer + len, "dstpt=%u ", - ntohs(match->dst.u.tcp.port)); - - return len; -} - -static unsigned int -tcp_print_range(char *buffer, const struct ip_nat_range *range) -{ - if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { - if (range->min.tcp.port == range->max.tcp.port) - return sprintf(buffer, "port %u ", - ntohs(range->min.tcp.port)); - else - return sprintf(buffer, "ports %u-%u ", - ntohs(range->min.tcp.port), - ntohs(range->max.tcp.port)); - } - else return 0; -} - struct ip_nat_protocol ip_nat_protocol_tcp = { .name = "TCP", .protonum = IPPROTO_TCP, @@ -177,8 +143,6 @@ struct ip_nat_protocol ip_nat_protocol_tcp = { .manip_pkt = tcp_manip_pkt, .in_range = tcp_in_range, .unique_tuple = tcp_unique_tuple, - .print = tcp_print, - .print_range = tcp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) .range_to_nfattr = ip_nat_port_range_to_nfattr, diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index c4906e1aa24a..ec6053fdc867 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -122,40 +122,6 @@ udp_manip_pkt(struct sk_buff **pskb, return 1; } -static unsigned int -udp_print(char *buffer, - const struct ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - unsigned int len = 0; - - if (mask->src.u.udp.port) - len += sprintf(buffer + len, "srcpt=%u ", - ntohs(match->src.u.udp.port)); - - - if (mask->dst.u.udp.port) - len += sprintf(buffer + len, "dstpt=%u ", - ntohs(match->dst.u.udp.port)); - - return len; -} - -static unsigned int -udp_print_range(char *buffer, const struct ip_nat_range *range) -{ - if (range->min.udp.port != 0 || 
range->max.udp.port != 0xFFFF) { - if (range->min.udp.port == range->max.udp.port) - return sprintf(buffer, "port %u ", - ntohs(range->min.udp.port)); - else - return sprintf(buffer, "ports %u-%u ", - ntohs(range->min.udp.port), - ntohs(range->max.udp.port)); - } - else return 0; -} - struct ip_nat_protocol ip_nat_protocol_udp = { .name = "UDP", .protonum = IPPROTO_UDP, @@ -163,8 +129,6 @@ struct ip_nat_protocol ip_nat_protocol_udp = { .manip_pkt = udp_manip_pkt, .in_range = udp_in_range, .unique_tuple = udp_unique_tuple, - .print = udp_print, - .print_range = udp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) .range_to_nfattr = ip_nat_port_range_to_nfattr, diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f0099a646a0b..3bf049517246 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -46,26 +46,10 @@ unknown_manip_pkt(struct sk_buff **pskb, return 1; } -static unsigned int -unknown_print(char *buffer, - const struct ip_conntrack_tuple *match, - const struct ip_conntrack_tuple *mask) -{ - return 0; -} - -static unsigned int -unknown_print_range(char *buffer, const struct ip_nat_range *range) -{ - return 0; -} - struct ip_nat_protocol ip_nat_unknown_protocol = { .name = "unknown", /* .me isn't set: getting a ref to this cannot fail. 
*/ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, .unique_tuple = unknown_unique_tuple, - .print = unknown_print, - .print_range = unknown_print_range }; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index cb66b8bddeb3..1de86282d232 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -95,6 +95,7 @@ static struct ipt_table nat_table = { .valid_hooks = NAT_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, + .af = AF_INET, }; /* Source NAT */ @@ -168,7 +169,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb, } static int ipt_snat_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) @@ -201,7 +202,7 @@ static int ipt_snat_checkentry(const char *tablename, } static int ipt_dnat_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include <linux/config.h> +#include <linux/in.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -53,6 +54,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_nat_helper.h> #include <linux/ip.h> +#include <linux/udp.h> #include <net/checksum.h> #include <net/udp.h> #include <asm/uaccess.h> diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 30cd4e18c129..ad438fb185b8 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -55,6 +55,44 @@ : ((hooknum) == NF_IP_LOCAL_IN ? 
"LOCAL_IN" \ : "*ERROR*"))) +#ifdef CONFIG_XFRM +static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) +{ + struct ip_conntrack *ct; + struct ip_conntrack_tuple *t; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + + ct = ip_conntrack_get(skb, &ctinfo); + if (ct == NULL) + return; + dir = CTINFO2DIR(ctinfo); + t = &ct->tuplehash[dir].tuple; + + if (dir == IP_CT_DIR_ORIGINAL) + statusbit = IPS_DST_NAT; + else + statusbit = IPS_SRC_NAT; + + if (ct->status & statusbit) { + fl->fl4_dst = t->dst.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP) + fl->fl_ip_dport = t->dst.u.tcp.port; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl->fl4_src = t->src.ip; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP) + fl->fl_ip_sport = t->src.u.tcp.port; + } +} +#endif + static unsigned int ip_nat_fn(unsigned int hooknum, struct sk_buff **pskb, @@ -162,18 +200,20 @@ ip_nat_in(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - u_int32_t saddr, daddr; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; unsigned int ret; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr)) { - dst_release((*pskb)->dst); - (*pskb)->dst = NULL; + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip) { + dst_release((*pskb)->dst); + (*pskb)->dst = NULL; + } } return ret; } @@ -185,29 +225,30 @@ ip_nat_out(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + /* root is playing with raw sockets. 
*/ if ((*pskb)->len < sizeof(struct iphdr) || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) return NF_ACCEPT; - /* We can hit fragment here; forwarded packets get - defragmented by connection tracking coming in, then - fragmented (grr) by the forward code. - - In future: If we have nfct != NULL, AND we have NAT - initialized, AND there is no helper, then we can do full - NAPT on the head, and IP-address-only NAT on the rest. - - I'm starting to have nightmares about fragments. */ - - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { - *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT); - - if (!*pskb) - return NF_STOLEN; + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip +#ifdef CONFIG_XFRM + || ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all +#endif + ) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; } - - return ip_nat_fn(hooknum, pskb, in, out, okfn); + return ret; } static unsigned int @@ -217,7 +258,8 @@ ip_nat_local_fn(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - u_int32_t saddr, daddr; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; unsigned int ret; /* root is playing with raw sockets. */ @@ -225,14 +267,20 @@ ip_nat_local_fn(unsigned int hooknum, || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) return NF_ACCEPT; - saddr = (*pskb)->nh.iph->saddr; - daddr = (*pskb)->nh.iph->daddr; - ret = ip_nat_fn(hooknum, pskb, in, out, okfn); if (ret != NF_DROP && ret != NF_STOLEN - && ((*pskb)->nh.iph->saddr != saddr - || (*pskb)->nh.iph->daddr != daddr)) - return ip_route_me_harder(pskb) == 0 ? 
ret : NF_DROP; + && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.ip != + ct->tuplehash[!dir].tuple.src.ip +#ifdef CONFIG_XFRM + || ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[dir].tuple.src.u.all +#endif + ) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + } return ret; } @@ -316,14 +364,18 @@ static int init_or_cleanup(int init) { int ret = 0; - need_ip_conntrack(); + need_conntrack(); if (!init) goto cleanup; +#ifdef CONFIG_XFRM + BUG_ON(ip_nat_decode_session != NULL); + ip_nat_decode_session = nat_decode_session; +#endif ret = ip_nat_rule_init(); if (ret < 0) { printk("ip_nat_init: can't setup rules.\n"); - goto cleanup_nothing; + goto cleanup_decode_session; } ret = nf_register_hook(&ip_nat_in_ops); if (ret < 0) { @@ -371,7 +423,11 @@ static int init_or_cleanup(int init) nf_unregister_hook(&ip_nat_in_ops); cleanup_rule_init: ip_nat_rule_cleanup(); - cleanup_nothing: + cleanup_decode_session: +#ifdef CONFIG_XFRM + ip_nat_decode_session = NULL; + synchronize_net(); +#endif return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8475e8..2371b2062c2d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2,7 +2,7 @@ * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling - * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,16 +11,17 @@ * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> * - increase module usage count as soon as we have rules inside * a table + * 08 Oct 2005 Harald Welte <lafore@netfilter.org> + * - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables" */ #include <linux/config.h> #include <linux/cache.h> +#include <linux/capability.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> -#include <linux/tcp.h> -#include <linux/udp.h> #include <linux/icmp.h> #include <net/ip.h> #include <asm/uaccess.h> @@ -29,6 +30,7 @@ #include <linux/err.h> #include <linux/cpumask.h> +#include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> MODULE_LICENSE("GPL"); @@ -61,14 +63,6 @@ do { \ #else #define IP_NF_ASSERT(x) #endif -#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) - -static DECLARE_MUTEX(ipt_mutex); - -/* Must have mutex */ -#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) -#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) -#include <linux/netfilter_ipv4/listhelp.h> #if 0 /* All the better to debug you with... */ @@ -83,48 +77,8 @@ static DECLARE_MUTEX(ipt_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ -/* The table itself */ -struct ipt_table_info -{ - /* Size per table */ - unsigned int size; - /* Number of entries: FIXME. 
--RR */ - unsigned int number; - /* Initial number of entries. Needed for module usage count */ - unsigned int initial_entries; - - /* Entry points and underflows */ - unsigned int hook_entry[NF_IP_NUMHOOKS]; - unsigned int underflow[NF_IP_NUMHOOKS]; - - /* ipt_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; -}; - -static LIST_HEAD(ipt_target); -static LIST_HEAD(ipt_match); -static LIST_HEAD(ipt_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) - -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - -#if 0 -#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) -#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) -#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) -#endif - /* Returns whether matches rule or not. 
*/ static inline int ip_packet_match(const struct iphdr *ip, @@ -243,7 +197,8 @@ int do_match(struct ipt_entry_match *m, int *hotdrop) { /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop)) + if (!m->u.kernel.match->match(skb, in, out, m->data, offset, + skb->nh.iph->ihl*4, hotdrop)) return 1; else return 0; @@ -274,6 +229,7 @@ ipt_do_table(struct sk_buff **pskb, const char *indev, *outdev; void *table_base; struct ipt_entry *e, *back; + struct xt_table_info *private = table->private; /* Initialization */ ip = (*pskb)->nh.iph; @@ -290,25 +246,11 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); - e = get_entry(table_base, table->private->hook_entry[hook]); - -#ifdef CONFIG_NETFILTER_DEBUG - /* Check noone else using our table */ - if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac - && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { - printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", - smp_processor_id(), - table->name, - &((struct ipt_entry *)table_base)->comefrom, - ((struct ipt_entry *)table_base)->comefrom); - } - ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; -#endif + table_base = (void *)private->entries[smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ - back = get_entry(table_base, table->private->underflow[hook]); + back = get_entry(table_base, private->underflow[hook]); do { IP_NF_ASSERT(e); @@ -394,9 +336,6 @@ ipt_do_table(struct sk_buff **pskb, } } while (!hotdrop); -#ifdef CONFIG_NETFILTER_DEBUG - ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac; -#endif read_unlock_bh(&table->lock); #ifdef DEBUG_ALLOW_ALL @@ -408,145 +347,6 @@ ipt_do_table(struct sk_buff **pskb, #endif } -/* - * These are weird, but module loading must not be 
done with mutex - * held (since they will register), and we have to have a single - * function to use try_then_request_module(). - */ - -/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ -static inline struct ipt_table *find_table_lock(const char *name) -{ - struct ipt_table *t; - - if (down_interruptible(&ipt_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &ipt_tables, list) - if (strcmp(t->name, name) == 0 && try_module_get(t->me)) - return t; - up(&ipt_mutex); - return NULL; -} - -/* Find match, grabs ref. Returns ERR_PTR() on error. */ -static inline struct ipt_match *find_match(const char *name, u8 revision) -{ - struct ipt_match *m; - int err = 0; - - if (down_interruptible(&ipt_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(m, &ipt_match, list) { - if (strcmp(m->name, name) == 0) { - if (m->revision == revision) { - if (try_module_get(m->me)) { - up(&ipt_mutex); - return m; - } - } else - err = -EPROTOTYPE; /* Found something. */ - } - } - up(&ipt_mutex); - return ERR_PTR(err); -} - -/* Find target, grabs ref. Returns ERR_PTR() on error. */ -static inline struct ipt_target *find_target(const char *name, u8 revision) -{ - struct ipt_target *t; - int err = 0; - - if (down_interruptible(&ipt_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &ipt_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision == revision) { - if (try_module_get(t->me)) { - up(&ipt_mutex); - return t; - } - } else - err = -EPROTOTYPE; /* Found something. 
*/ - } - } - up(&ipt_mutex); - return ERR_PTR(err); -} - -struct ipt_target *ipt_find_target(const char *name, u8 revision) -{ - struct ipt_target *target; - - target = try_then_request_module(find_target(name, revision), - "ipt_%s", name); - if (IS_ERR(target) || !target) - return NULL; - return target; -} - -static int match_revfn(const char *name, u8 revision, int *bestp) -{ - struct ipt_match *m; - int have_rev = 0; - - list_for_each_entry(m, &ipt_match, list) { - if (strcmp(m->name, name) == 0) { - if (m->revision > *bestp) - *bestp = m->revision; - if (m->revision == revision) - have_rev = 1; - } - } - return have_rev; -} - -static int target_revfn(const char *name, u8 revision, int *bestp) -{ - struct ipt_target *t; - int have_rev = 0; - - list_for_each_entry(t, &ipt_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision > *bestp) - *bestp = t->revision; - if (t->revision == revision) - have_rev = 1; - } - } - return have_rev; -} - -/* Returns true or false (if no such extension at all) */ -static inline int find_revision(const char *name, u8 revision, - int (*revfn)(const char *, u8, int *), - int *err) -{ - int have_rev, best = -1; - - if (down_interruptible(&ipt_mutex) != 0) { - *err = -EINTR; - return 1; - } - have_rev = revfn(name, revision, &best); - up(&ipt_mutex); - - /* Nothing at all? Return 0 to try loading module. */ - if (best == -1) { - *err = -ENOENT; - return 0; - } - - *err = best; - if (!have_rev) - *err = -EPROTONOSUPPORT; - return 1; -} - - /* All zeroes == unconditional rule. */ static inline int unconditional(const struct ipt_ip *ip) @@ -563,7 +363,8 @@ unconditional(const struct ipt_ip *ip) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. 
*/ static int -mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -572,7 +373,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e - = (struct ipt_entry *)(newinfo->entries + pos); + = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -622,13 +423,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ipt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ipt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -645,7 +446,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ipt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -708,7 +509,7 @@ check_match(struct ipt_entry_match *m, { struct ipt_match *match; - match = try_then_request_module(find_match(m->u.user.name, + match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { @@ -753,7 +554,8 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size, goto cleanup_matches; t = ipt_get_target(e); - target = try_then_request_module(find_target(t->u.user.name, + target = try_then_request_module(xt_find_target(AF_INET, + t->u.user.name, t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { @@ -790,7 +592,7 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size, static inline int check_entry_size_and_hooks(struct 
ipt_entry *e, - struct ipt_table_info *newinfo, + struct xt_table_info *newinfo, unsigned char *base, unsigned char *limit, const unsigned int *hook_entries, @@ -824,7 +626,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, < 0 (not IPT_RETURN). --RR */ /* Clear counters and comefrom */ - e->counters = ((struct ipt_counters) { 0, 0 }); + e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; (*i)++; @@ -854,7 +656,8 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, - struct ipt_table_info *newinfo, + struct xt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -875,11 +678,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -907,95 +710,79 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; } -static 
struct ipt_table_info * -replace_table(struct ipt_table *table, - unsigned int num_counters, - struct ipt_table_info *newinfo, - int *error) +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ipt_entry *e, + struct xt_counters total[], + unsigned int *i) { - struct ipt_table_info *oldinfo; - -#ifdef CONFIG_NETFILTER_DEBUG - { - struct ipt_entry *table_base; - unsigned int i; - - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; - } - } -#endif - - /* Do the substitution. */ - write_lock_bh(&table->lock); - /* Check inside lock: is the old number correct? */ - if (num_counters != table->private->number) { - duprintf("num_counters != table->private->number (%u/%u)\n", - num_counters, table->private->number); - write_unlock_bh(&table->lock); - *error = -EAGAIN; - return NULL; - } - oldinfo = table->private; - table->private = newinfo; - newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - return oldinfo; + (*i)++; + return 0; } -/* Gets counters. */ static inline int -add_entry_to_counter(const struct ipt_entry *e, +set_entry_to_counter(const struct ipt_entry *e, struct ipt_counters total[], unsigned int *i) { - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); (*i)++; return 0; } static void -get_counters(const struct ipt_table_info *t, - struct ipt_counters counters[]) +get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. 
+ */ + curcpu = raw_smp_processor_id(); + + i = 0; + IPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1010,26 +797,32 @@ copy_entries_to_user(unsigned int total_size, { unsigned int off, num, countersize; struct ipt_entry *e; - struct ipt_counters *counters; + struct xt_counters *counters; + struct xt_table_info *private = table->private; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ - countersize = sizeof(struct ipt_counters) * table->private->number; - counters = vmalloc(countersize); + countersize = sizeof(struct xt_counters) * private->number; + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); - get_counters(table->private, counters); + get_counters(private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... 
*/ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1041,7 +834,7 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry_match *m; struct ipt_entry_target *t; - e = (struct ipt_entry *)(table->private->entries + off); + e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -1089,21 +882,22 @@ get_entries(const struct ipt_get_entries *entries, int ret; struct ipt_table *t; - t = find_table_lock(entries->name); + t = xt_find_table_lock(AF_INET, entries->name); if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", - t->private->number); - if (entries->size == t->private->size) - ret = copy_entries_to_user(t->private->size, + private->number); + if (entries->size == private->size) + ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - t->private->size, + private->size, entries->size); ret = -EINVAL; } module_put(t->me); - up(&ipt_mutex); + xt_table_unlock(t); } else ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -1116,8 +910,9 @@ do_replace(void __user *user, unsigned int len) int ret; struct ipt_replace tmp; struct ipt_table *t; - struct ipt_table_info *newinfo, *oldinfo; - struct ipt_counters *counters; + struct xt_table_info *newinfo, *oldinfo; + struct xt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1126,38 +921,33 @@ do_replace(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.size) return -ENOPROTOOPT; - /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) - return -ENOMEM; - - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } - counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); + counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; duprintf("ip_tables: Translated table\n"); - t = try_then_request_module(find_table_lock(tmp.name), + t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name), "iptable_%s", tmp.name); if (!t || IS_ERR(t)) { ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -1172,7 +962,7 @@ do_replace(void __user *user, unsigned int len) goto put_module; } - oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -1189,24 +979,25 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + xt_free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, - sizeof(struct ipt_counters) * tmp.num_counters) != 0) + sizeof(struct xt_counters) * tmp.num_counters) != 0) ret = -EFAULT; vfree(counters); - up(&ipt_mutex); + xt_table_unlock(t); return ret; put_module: module_put(t->me); - up(&ipt_mutex); + xt_table_unlock(t); free_newinfo_counters_untrans: - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } @@ -1214,7 +1005,7 @@ do_replace(void __user *user, unsigned int len) * and everything is OK. 
*/ static inline int add_counter_to_entry(struct ipt_entry *e, - const struct ipt_counters addme[], + const struct xt_counters addme[], unsigned int *i) { #if 0 @@ -1236,17 +1027,19 @@ static int do_add_counters(void __user *user, unsigned int len) { unsigned int i; - struct ipt_counters_info tmp, *paddc; + struct xt_counters_info tmp, *paddc; struct ipt_table *t; + struct xt_table_info *private; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1255,27 +1048,30 @@ do_add_counters(void __user *user, unsigned int len) goto free; } - t = find_table_lock(tmp.name); + t = xt_find_table_lock(AF_INET, tmp.name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } write_lock_bh(&t->lock); - if (t->private->number != paddc->num_counters) { + private = t->private; + if (private->number != paddc->num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; - IPT_ENTRY_ITERATE(t->private->entries, - t->private->size, + /* Choose the copy that is on our node */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, + private->size, add_counter_to_entry, paddc->counters, &i); unlock_up_free: write_unlock_bh(&t->lock); - up(&ipt_mutex); + xt_table_unlock(t); module_put(t->me); free: vfree(paddc); @@ -1334,25 +1130,26 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } name[IPT_TABLE_MAXNAMELEN-1] = '\0'; - t = try_then_request_module(find_table_lock(name), + t = try_then_request_module(xt_find_table_lock(AF_INET, name), "iptable_%s", name); if (t && !IS_ERR(t)) { struct ipt_getinfo info; + struct xt_table_info *private = t->private; info.valid_hooks = t->valid_hooks; - memcpy(info.hook_entry, 
t->private->hook_entry, + memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); - memcpy(info.underflow, t->private->underflow, + memcpy(info.underflow, private->underflow, sizeof(info.underflow)); - info.num_entries = t->private->number; - info.size = t->private->size; + info.num_entries = private->number; + info.size = private->size; memcpy(info.name, name, sizeof(info.name)); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; - up(&ipt_mutex); + xt_table_unlock(t); module_put(t->me); } else ret = t ? PTR_ERR(t) : -ENOENT; @@ -1379,7 +1176,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IPT_SO_GET_REVISION_MATCH: case IPT_SO_GET_REVISION_TARGET: { struct ipt_get_revision rev; - int (*revfn)(const char *, u8, int *); + int target; if (*len != sizeof(rev)) { ret = -EINVAL; @@ -1391,12 +1188,13 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } if (cmd == IPT_SO_GET_REVISION_TARGET) - revfn = target_revfn; + target = 1; else - revfn = match_revfn; + target = 0; - try_then_request_module(find_revision(rev.name, rev.revision, - revfn, &ret), + try_then_request_module(xt_find_revision(AF_INET, rev.name, + rev.revision, + target, &ret), "ipt_%s", rev.name); break; } @@ -1409,309 +1207,53 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -/* Registration hooks for targets. 
*/ -int -ipt_register_target(struct ipt_target *target) +int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) { int ret; - - ret = down_interruptible(&ipt_mutex); - if (ret != 0) - return ret; - list_add(&target->list, &ipt_target); - up(&ipt_mutex); - return ret; -} - -void -ipt_unregister_target(struct ipt_target *target) -{ - down(&ipt_mutex); - LIST_DELETE(&ipt_target, target); - up(&ipt_mutex); -} - -int -ipt_register_match(struct ipt_match *match) -{ - int ret; - - ret = down_interruptible(&ipt_mutex); - if (ret != 0) - return ret; - - list_add(&match->list, &ipt_match); - up(&ipt_mutex); - - return ret; -} - -void -ipt_unregister_match(struct ipt_match *match) -{ - down(&ipt_mutex); - LIST_DELETE(&ipt_match, match); - up(&ipt_mutex); -} - -int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) -{ - int ret; - struct ipt_table_info *newinfo; - static struct ipt_table_info bootstrap + struct xt_table_info *newinfo; + static struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu + * but dont care of preemption + */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } - ret = down_interruptible(&ipt_mutex); - if (ret != 0) { - vfree(newinfo); + if (xt_register_table(table, &bootstrap, newinfo) != 0) { + xt_free_table_info(newinfo); return ret; } - /* Don't autoload: we'd eat our tail... 
*/ - if (list_named_find(&ipt_tables, table->name)) { - ret = -EEXIST; - goto free_unlock; - } - - /* Simplifies replace_table code. */ - table->private = &bootstrap; - if (!replace_table(table, 0, newinfo, &ret)) - goto free_unlock; - - duprintf("table->private->number = %u\n", - table->private->number); - - /* save number of initial entries */ - table->private->initial_entries = table->private->number; - - rwlock_init(&table->lock); - list_prepend(&ipt_tables, table); - - unlock: - up(&ipt_mutex); - return ret; - - free_unlock: - vfree(newinfo); - goto unlock; + return 0; } void ipt_unregister_table(struct ipt_table *table) { - down(&ipt_mutex); - LIST_DELETE(&ipt_tables, table); - up(&ipt_mutex); + struct xt_table_info *private; + void *loc_cpu_entry; - /* Decrease module usage counts and free resources */ - IPT_ENTRY_ITERATE(table->private->entries, table->private->size, - cleanup_entry, NULL); - vfree(table->private); -} - -/* Returns 1 if the port is matched by the range, 0 otherwise */ -static inline int -port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) -{ - int ret; - - ret = (port >= min && port <= max) ^ invert; - return ret; -} - -static int -tcp_find_option(u_int8_t option, - const struct sk_buff *skb, - unsigned int optlen, - int invert, - int *hotdrop) -{ - /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; - unsigned int i; - - duprintf("tcp_match: finding option\n"); - - if (!optlen) - return invert; - - /* If we don't have the whole header, drop packet. 
*/ - op = skb_header_pointer(skb, - skb->nh.iph->ihl*4 + sizeof(struct tcphdr), - optlen, _opt); - if (op == NULL) { - *hotdrop = 1; - return 0; - } - - for (i = 0; i < optlen; ) { - if (op[i] == option) return !invert; - if (op[i] < 2) i++; - else i += op[i+1]?:1; - } - - return invert; -} - -static int -tcp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - int *hotdrop) -{ - struct tcphdr _tcph, *th; - const struct ipt_tcp *tcpinfo = matchinfo; - - if (offset) { - /* To quote Alan: - - Don't allow a fragment of TCP 8 bytes in. Nobody normal - causes this. Its a cracker trying to break in by doing a - flag overwrite to pass the direction checks. - */ - if (offset == 1) { - duprintf("Dropping evil TCP offset=1 frag.\n"); - *hotdrop = 1; - } - /* Must not be a fragment. */ - return 0; - } - -#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) - - th = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_tcph), &_tcph); - if (th == NULL) { - /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ - duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; - } - - if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], - ntohs(th->source), - !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))) - return 0; - if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], - ntohs(th->dest), - !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))) - return 0; - if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) - == tcpinfo->flg_cmp, - IPT_TCP_INV_FLAGS)) - return 0; - if (tcpinfo->option) { - if (th->doff * 4 < sizeof(_tcph)) { - *hotdrop = 1; - return 0; - } - if (!tcp_find_option(tcpinfo->option, skb, - th->doff*4 - sizeof(_tcph), - tcpinfo->invflags & IPT_TCP_INV_OPTION, - hotdrop)) - return 0; - } - return 1; -} - -/* Called when user tries to insert an entry of this type. 
*/ -static int -tcp_checkentry(const char *tablename, - const struct ipt_ip *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - const struct ipt_tcp *tcpinfo = matchinfo; - - /* Must specify proto == TCP, and no unknown invflags */ - return ip->proto == IPPROTO_TCP - && !(ip->invflags & IPT_INV_PROTO) - && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp)) - && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK); -} - -static int -udp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - int *hotdrop) -{ - struct udphdr _udph, *uh; - const struct ipt_udp *udpinfo = matchinfo; - - /* Must not be a fragment. */ - if (offset) - return 0; - - uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_udph), &_udph); - if (uh == NULL) { - /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ - duprintf("Dropping evil UDP tinygram.\n"); - *hotdrop = 1; - return 0; - } + private = xt_unregister_table(table); - return port_match(udpinfo->spts[0], udpinfo->spts[1], - ntohs(uh->source), - !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) - && port_match(udpinfo->dpts[0], udpinfo->dpts[1], - ntohs(uh->dest), - !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); -} - -/* Called when user tries to insert an entry of this type. 
*/ -static int -udp_checkentry(const char *tablename, - const struct ipt_ip *ip, - void *matchinfo, - unsigned int matchinfosize, - unsigned int hook_mask) -{ - const struct ipt_udp *udpinfo = matchinfo; - - /* Must specify proto == UDP, and no unknown invflags */ - if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) { - duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, - IPPROTO_UDP); - return 0; - } - if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) { - duprintf("ipt_udp: matchsize %u != %u\n", - matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp))); - return 0; - } - if (udpinfo->invflags & ~IPT_UDP_INV_MASK) { - duprintf("ipt_udp: unknown flags %X\n", - udpinfo->invflags); - return 0; - } - - return 1; + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_free_table_info(private); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ @@ -1730,6 +1272,7 @@ icmp_match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { struct icmphdr _icmph, *ic; @@ -1739,8 +1282,7 @@ icmp_match(const struct sk_buff *skb, if (offset) return 0; - ic = skb_header_pointer(skb, skb->nh.iph->ihl*4, - sizeof(_icmph), &_icmph); + ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. @@ -1760,11 +1302,12 @@ icmp_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. 
*/ static int icmp_checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *info, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { + const struct ipt_ip *ip = info; const struct ipt_icmp *icmpinfo = matchinfo; /* Must specify proto == ICMP, and no unknown invflags */ @@ -1794,123 +1337,22 @@ static struct nf_sockopt_ops ipt_sockopts = { .get = do_ipt_get_ctl, }; -static struct ipt_match tcp_matchstruct = { - .name = "tcp", - .match = &tcp_match, - .checkentry = &tcp_checkentry, -}; - -static struct ipt_match udp_matchstruct = { - .name = "udp", - .match = &udp_match, - .checkentry = &udp_checkentry, -}; - static struct ipt_match icmp_matchstruct = { .name = "icmp", .match = &icmp_match, .checkentry = &icmp_checkentry, }; -#ifdef CONFIG_PROC_FS -static inline int print_name(const char *i, - off_t start_offset, char *buffer, int length, - off_t *pos, unsigned int *count) -{ - if ((*count)++ >= start_offset) { - unsigned int namelen; - - namelen = sprintf(buffer + *pos, "%s\n", - i + sizeof(struct list_head)); - if (*pos + namelen > length) { - /* Stop iterating */ - return 1; - } - *pos += namelen; - } - return 0; -} - -static inline int print_target(const struct ipt_target *t, - off_t start_offset, char *buffer, int length, - off_t *pos, unsigned int *count) -{ - if (t == &ipt_standard_target || t == &ipt_error_target) - return 0; - return print_name((char *)t, start_offset, buffer, length, pos, count); -} - -static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&ipt_mutex) != 0) - return 0; - - LIST_FIND(&ipt_tables, print_name, void *, - offset, buffer, length, &pos, &count); - - up(&ipt_mutex); - - /* `start' hack - see fs/proc/generic.c line ~105 */ - *start=(char *)((unsigned long)count-offset); - return pos; -} - -static int ipt_get_targets(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int 
count = 0; - - if (down_interruptible(&ipt_mutex) != 0) - return 0; - - LIST_FIND(&ipt_target, print_target, struct ipt_target *, - offset, buffer, length, &pos, &count); - - up(&ipt_mutex); - - *start = (char *)((unsigned long)count - offset); - return pos; -} - -static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&ipt_mutex) != 0) - return 0; - - LIST_FIND(&ipt_match, print_name, void *, - offset, buffer, length, &pos, &count); - - up(&ipt_mutex); - - *start = (char *)((unsigned long)count - offset); - return pos; -} - -static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = -{ { "ip_tables_names", ipt_get_tables }, - { "ip_tables_targets", ipt_get_targets }, - { "ip_tables_matches", ipt_get_matches }, - { NULL, NULL} }; -#endif /*CONFIG_PROC_FS*/ - static int __init init(void) { int ret; + xt_proto_init(AF_INET); + /* Noone else will be downing sem now, so we won't sleep */ - down(&ipt_mutex); - list_append(&ipt_target, &ipt_standard_target); - list_append(&ipt_target, &ipt_error_target); - list_append(&ipt_match, &tcp_matchstruct); - list_append(&ipt_match, &udp_matchstruct); - list_append(&ipt_match, &icmp_matchstruct); - up(&ipt_mutex); + xt_register_target(AF_INET, &ipt_standard_target); + xt_register_target(AF_INET, &ipt_error_target); + xt_register_match(AF_INET, &icmp_matchstruct); /* Register setsockopt */ ret = nf_register_sockopt(&ipt_sockopts); @@ -1919,49 +1361,23 @@ static int __init init(void) return ret; } -#ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *proc; - int i; - - for (i = 0; ipt_proc_entry[i].name; i++) { - proc = proc_net_create(ipt_proc_entry[i].name, 0, - ipt_proc_entry[i].get_info); - if (!proc) { - while (--i >= 0) - proc_net_remove(ipt_proc_entry[i].name); - nf_unregister_sockopt(&ipt_sockopts); - return -ENOMEM; - } - proc->owner = THIS_MODULE; - } - } -#endif - - printk("ip_tables: (C) 2000-2002 Netfilter 
core team\n"); + printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; } static void __exit fini(void) { nf_unregister_sockopt(&ipt_sockopts); -#ifdef CONFIG_PROC_FS - { - int i; - for (i = 0; ipt_proc_entry[i].name; i++) - proc_net_remove(ipt_proc_entry[i].name); - } -#endif + + xt_unregister_match(AF_INET, &icmp_matchstruct); + xt_unregister_target(AF_INET, &ipt_error_target); + xt_unregister_target(AF_INET, &ipt_standard_target); + + xt_proto_fini(AF_INET); } EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table); -EXPORT_SYMBOL(ipt_register_match); -EXPORT_SYMBOL(ipt_unregister_match); EXPORT_SYMBOL(ipt_do_table); -EXPORT_SYMBOL(ipt_register_target); -EXPORT_SYMBOL(ipt_unregister_target); -EXPORT_SYMBOL(ipt_find_target); - module_init(init); module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 45c52d8f4d99..d9bc971f03af 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -379,12 +379,13 @@ target(struct sk_buff **pskb, static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { struct ipt_clusterip_tgt_info *cipinfo = targinfo; + const struct ipt_entry *e = e_void; struct clusterip_config *config; diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 6e319570a28c..898cdf79ce18 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -57,7 +57,7 @@ target(struct sk_buff **pskb, static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index a1319693f648..706445426a6d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -113,12 +113,13 @@ target(struct sk_buff **pskb, static int 
checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; + const struct ipt_entry *e = e_void; if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) { printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n", diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 30be0f1dae37..6606ddb66a29 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -431,7 +431,7 @@ ipt_log_target(struct sk_buff **pskb, } static int ipt_log_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..12c56d3343ca 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include <linux/config.h> #include <linux/types.h> +#include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/timer.h> #include <linux/module.h> @@ -18,6 +19,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/checksum.h> +#include <net/route.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -38,7 +40,7 @@ static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. 
--RR */ static int masquerade_check(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index e6e7b6095363..b074467fe67b 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -31,7 +31,7 @@ MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target"); static int check(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c deleted file mode 100644 index 3cedc9be8807..000000000000 --- a/net/ipv4/netfilter/ipt_NFQUEUE.c +++ /dev/null @@ -1,70 +0,0 @@ -/* iptables module for using new netfilter netlink queue - * - * (C) 2005 by Harald Welte <laforge@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_NFQUEUE.h> - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables NFQUEUE target"); -MODULE_LICENSE("GPL"); - -static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const void *targinfo, - void *userinfo) -{ - const struct ipt_NFQ_info *tinfo = targinfo; - - return NF_QUEUE_NR(tinfo->queuenum); -} - -static int -checkentry(const char *tablename, - const struct ipt_entry *e, - void *targinfo, - unsigned int targinfosize, - unsigned int hook_mask) -{ - if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { - printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", - targinfosize, - IPT_ALIGN(sizeof(struct ipt_NFQ_info))); - return 0; - } - - return 1; -} - -static struct ipt_target ipt_NFQ_reg = { - .name = "NFQUEUE", - .target = target, - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ipt_register_target(&ipt_NFQ_reg); -} - -static void __exit fini(void) -{ - ipt_unregister_target(&ipt_NFQ_reg); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 5245bfd33d52..140be51f2f01 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables REDIRECT target module"); /* FIXME: Take multiple ranges --RR */ static int redirect_check(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index f057025a719e..3eb47aae78c5 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -203,7 +203,7 @@ 
static void send_reset(struct sk_buff *oldskb, int hook) sizeof(struct tcphdr), 0)); /* Adjust IP TTL, DF */ - nskb->nh.iph->ttl = MAXTTL; + nskb->nh.iph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); /* Set DF, id = 0 */ nskb->nh.iph->frag_off = htons(IP_DF); nskb->nh.iph->id = 0; @@ -282,12 +282,13 @@ static unsigned int reject(struct sk_buff **pskb, } static int check(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { const struct ipt_reject_info *rejinfo = targinfo; + const struct ipt_entry *e = e_void; if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) { DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize); diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c index 7a0536d864ac..a22de59bba0e 100644 --- a/net/ipv4/netfilter/ipt_SAME.c +++ b/net/ipv4/netfilter/ipt_SAME.c @@ -49,7 +49,7 @@ MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip"); static int same_check(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 8db70d6908c3..c122841e182c 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -210,12 +210,13 @@ static inline int find_syn_match(const struct ipt_entry_match *m) /* Must specify -p tcp --syn/--tcp-flags SYN */ static int ipt_tcpmss_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { const struct ipt_tcpmss_info *tcpmssinfo = targinfo; + const struct ipt_entry *e = e_void; if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) { DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n", diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index deadb36d4428..3a44a56db239 100644 --- 
a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -52,7 +52,7 @@ target(struct sk_buff **pskb, static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e_void, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index b9ae6a9382f3..b769eb231970 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -66,7 +66,7 @@ ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, } static int ipt_ttl_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 2883ccd8a91d..641dbc477650 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -77,15 +77,15 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define PRINTR(format, args...) 
do { if (net_ratelimit()) printk(format , ## args); } while (0) static unsigned int nlbufsiz = 4096; -module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */ +module_param(nlbufsiz, uint, 0400); MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); static unsigned int flushtimeout = 10; -module_param(flushtimeout, int, 0600); +module_param(flushtimeout, uint, 0600); MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); +static int nflog = 1; +module_param(nflog, bool, 0400); MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); /* global data structures */ @@ -330,7 +330,7 @@ static void ipt_logfn(unsigned int pf, } static int ipt_ulog_checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hookmask) @@ -376,7 +376,7 @@ static int __init init(void) DEBUGP("ipt_ULOG: init module\n"); - if (nlbufsiz >= 128*1024) { + if (nlbufsiz > 128*1024) { printk("Netlink buffer has to be <= 128kB\n"); return -EINVAL; } diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index e19c2a52d00c..d6b83a976518 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -29,7 +29,7 @@ static inline int match_type(u_int32_t addr, u_int16_t mask) static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, - int offset, int *hotdrop) + int offset, unsigned int protoff, int *hotdrop) { const struct ipt_addrtype_info *info = matchinfo; const struct iphdr *iph = skb->nh.iph; @@ -43,7 +43,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in, return ret; } -static int checkentry(const char *tablename, const struct ipt_ip *ip, +static int checkentry(const char *tablename, const void *ip, void *matchinfo, unsigned int matchsize, unsigned int 
hook_mask) { diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index a0fea847cb72..144adfec13cc 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -41,6 +41,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { struct ip_auth_hdr _ahdr, *ah; @@ -50,7 +51,7 @@ match(const struct sk_buff *skb, if (offset) return 0; - ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + ah = skb_header_pointer(skb, protoff, sizeof(_ahdr), &_ahdr); if (ah == NULL) { /* We've been asked to examine this packet, and we @@ -69,12 +70,13 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip_void, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) { const struct ipt_ah *ahinfo = matchinfo; + const struct ipt_ip *ip = ip_void; /* Must specify proto == AH, and no unknown invflags */ if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) { diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c index 5df52a64a5d4..92063b4f8602 100644 --- a/net/ipv4/netfilter/ipt_dscp.c +++ b/net/ipv4/netfilter/ipt_dscp.c @@ -21,7 +21,7 @@ MODULE_LICENSE("GPL"); static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, - int offset, int *hotdrop) + int offset, unsigned int protoff, int *hotdrop) { const struct ipt_dscp_info *info = matchinfo; const struct iphdr *iph = skb->nh.iph; @@ -31,7 +31,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in, return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert; } -static int checkentry(const char *tablename, const struct ipt_ip *ip, +static int checkentry(const char *tablename, const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { diff 
--git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index b6f7181e89cc..e68b0c7981f0 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -67,7 +67,7 @@ static inline int match_tcp(const struct sk_buff *skb, static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, - int offset, int *hotdrop) + int offset, unsigned int protoff, int *hotdrop) { const struct ipt_ecn_info *info = matchinfo; @@ -85,11 +85,12 @@ static int match(const struct sk_buff *skb, const struct net_device *in, return 1; } -static int checkentry(const char *tablename, const struct ipt_ip *ip, +static int checkentry(const char *tablename, const void *ip_void, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { const struct ipt_ecn_info *info = matchinfo; + const struct ipt_ip *ip = ip_void; if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info))) return 0; diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c index e1d0dd31e117..9de191a8162d 100644 --- a/net/ipv4/netfilter/ipt_esp.c +++ b/net/ipv4/netfilter/ipt_esp.c @@ -42,6 +42,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { struct ip_esp_hdr _esp, *eh; @@ -51,7 +52,7 @@ match(const struct sk_buff *skb, if (offset) return 0; - eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp); if (eh == NULL) { /* We've been asked to examine this packet, and we @@ -70,12 +71,13 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. 
*/ static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip_void, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) { const struct ipt_esp *espinfo = matchinfo; + const struct ipt_ip *ip = ip_void; /* Must specify proto == ESP, and no unknown invflags */ if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) { diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 2dd1cccbdab9..4fe48c1bd5f3 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -429,6 +429,7 @@ hashlimit_match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { struct ipt_hashlimit_info *r = @@ -504,7 +505,7 @@ hashlimit_match(const struct sk_buff *skb, static int hashlimit_checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c index b835b7b2e560..13fb16fb7892 100644 --- a/net/ipv4/netfilter/ipt_iprange.c +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -28,7 +28,7 @@ match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, - int offset, int *hotdrop) + int offset, unsigned int protoff, int *hotdrop) { const struct ipt_iprange_info *info = matchinfo; const struct iphdr *iph = skb->nh.iph; @@ -63,7 +63,7 @@ match(const struct sk_buff *skb, } static int check(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c deleted file mode 100644 index 4eabcfbda9d1..000000000000 --- a/net/ipv4/netfilter/ipt_length.c +++ /dev/null @@ -1,64 +0,0 @@ -/* Kernel module to match packet length. 
*/ -/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv4/ipt_length.h> -#include <linux/netfilter_ipv4/ip_tables.h> - -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("IP tables packet length matching module"); -MODULE_LICENSE("GPL"); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - int *hotdrop) -{ - const struct ipt_length_info *info = matchinfo; - u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); - - return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; -} - -static int -checkentry(const char *tablename, - const struct ipt_ip *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info))) - return 0; - - return 1; -} - -static struct ipt_match length_match = { - .name = "length", - .match = &match, - .checkentry = &checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ipt_register_match(&length_match); -} - -static void __exit fini(void) -{ - ipt_unregister_match(&length_match); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c index 99e8188162e2..2d52326553f1 100644 --- a/net/ipv4/netfilter/ipt_multiport.c +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -97,6 +97,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { u16 _ports[2], *pptr; @@ -105,7 +106,7 @@ match(const struct sk_buff *skb, if (offset) return 0; - pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + 
pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we @@ -128,6 +129,7 @@ match_v1(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { u16 _ports[2], *pptr; @@ -136,7 +138,7 @@ match_v1(const struct sk_buff *skb, if (offset) return 0; - pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we @@ -154,7 +156,7 @@ match_v1(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) @@ -164,7 +166,7 @@ checkentry(const char *tablename, static int checkentry_v1(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 0cee2862ed85..4843d0c9734f 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -27,6 +27,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { const struct ipt_owner_info *info = matchinfo; @@ -51,7 +52,7 @@ match(const struct sk_buff *skb, static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c deleted file mode 100644 index 1a53924041fc..000000000000 --- a/net/ipv4/netfilter/ipt_physdev.c +++ /dev/null @@ -1,134 +0,0 @@ -/* Kernel module to match the bridge port in and - * out device for IP packets coming into contact with a bridge. 
*/ - -/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ipt_physdev.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_bridge.h> -#define MATCH 1 -#define NOMATCH 0 - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); -MODULE_DESCRIPTION("iptables bridge physical device match module"); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - int *hotdrop) -{ - int i; - static const char nulldevname[IFNAMSIZ]; - const struct ipt_physdev_info *info = matchinfo; - unsigned int ret; - const char *indev, *outdev; - struct nf_bridge_info *nf_bridge; - - /* Not a bridged IP packet or no info available yet: - * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if - * the destination device will be a bridge. 
*/ - if (!(nf_bridge = skb->nf_bridge)) { - /* Return MATCH if the invert flags of the used options are on */ - if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && - !(info->invert & IPT_PHYSDEV_OP_BRIDGED)) - return NOMATCH; - if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) && - !(info->invert & IPT_PHYSDEV_OP_ISIN)) - return NOMATCH; - if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) && - !(info->invert & IPT_PHYSDEV_OP_ISOUT)) - return NOMATCH; - if ((info->bitmask & IPT_PHYSDEV_OP_IN) && - !(info->invert & IPT_PHYSDEV_OP_IN)) - return NOMATCH; - if ((info->bitmask & IPT_PHYSDEV_OP_OUT) && - !(info->invert & IPT_PHYSDEV_OP_OUT)) - return NOMATCH; - return MATCH; - } - - /* This only makes sense in the FORWARD and POSTROUTING chains */ - if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && - (!!(nf_bridge->mask & BRNF_BRIDGED) ^ - !(info->invert & IPT_PHYSDEV_OP_BRIDGED))) - return NOMATCH; - - if ((info->bitmask & IPT_PHYSDEV_OP_ISIN && - (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) || - (info->bitmask & IPT_PHYSDEV_OP_ISOUT && - (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT)))) - return NOMATCH; - - if (!(info->bitmask & IPT_PHYSDEV_OP_IN)) - goto match_outdev; - indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)indev)[i] - ^ ((const unsigned int *)info->physindev)[i]) - & ((const unsigned int *)info->in_mask)[i]; - } - - if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN)) - return NOMATCH; - -match_outdev: - if (!(info->bitmask & IPT_PHYSDEV_OP_OUT)) - return MATCH; - outdev = nf_bridge->physoutdev ? 
- nf_bridge->physoutdev->name : nulldevname; - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { - ret |= (((const unsigned int *)outdev)[i] - ^ ((const unsigned int *)info->physoutdev)[i]) - & ((const unsigned int *)info->out_mask)[i]; - } - - return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT); -} - -static int -checkentry(const char *tablename, - const struct ipt_ip *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - const struct ipt_physdev_info *info = matchinfo; - - if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info))) - return 0; - if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) || - info->bitmask & ~IPT_PHYSDEV_OP_MASK) - return 0; - return 1; -} - -static struct ipt_match physdev_match = { - .name = "physdev", - .match = &match, - .checkentry = &checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ipt_register_match(&physdev_match); -} - -static void __exit fini(void) -{ - ipt_unregister_match(&physdev_match); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c new file mode 100644 index 000000000000..709debcc69c9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_policy.c @@ -0,0 +1,170 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/xfrm.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_policy.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("IPtables IPsec policy matching module"); +MODULE_LICENSE("GPL"); + + +static inline int +match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e) +{ +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH(saddr, x->props.saddr.a4 & e->smask) && + MATCH(daddr, x->id.daddr.a4 & e->dmask) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info) +{ + const struct ipt_policy_elem *e; + struct sec_path *sp = skb->sp; + int strict = info->flags & IPT_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->x[i].xvec, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info) +{ + const struct ipt_policy_elem *e; + struct dst_entry *dst = skb->dst; + int strict = info->flags & IPT_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 
1 : 0; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, int offset, int *hotdrop) +{ + const struct ipt_policy_info *info = matchinfo; + int ret; + + if (info->flags & IPT_POLICY_MATCH_IN) + ret = match_policy_in(skb, info); + else + ret = match_policy_out(skb, info); + + if (ret < 0) + ret = info->flags & IPT_POLICY_MATCH_NONE ? 1 : 0; + else if (info->flags & IPT_POLICY_MATCH_NONE) + ret = 0; + + return ret; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_policy_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(*info))) { + printk(KERN_ERR "ipt_policy: matchsize %u != %zu\n", + matchsize, IPT_ALIGN(sizeof(*info))); + return 0; + } + if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) { + printk(KERN_ERR "ipt_policy: neither incoming nor " + "outgoing policy selected\n"); + return 0; + } + if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN) + && info->flags & IPT_POLICY_MATCH_OUT) { + printk(KERN_ERR "ipt_policy: output policy not valid in " + "PRE_ROUTING and INPUT\n"); + return 0; + } + if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT) + && info->flags & IPT_POLICY_MATCH_IN) { + printk(KERN_ERR "ipt_policy: input policy not valid in " + "POST_ROUTING and OUTPUT\n"); + return 0; + } + if (info->len > IPT_POLICY_MAX_ELEM) { + printk(KERN_ERR "ipt_policy: too many policy elements\n"); + return 0; + } + + return 1; +} + +static struct ipt_match policy_match = { + .name = "policy", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&policy_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&policy_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_recent.c 
b/net/ipv4/netfilter/ipt_recent.c index 261cbb4d4c49..44611d6d14f5 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -24,10 +24,10 @@ #define HASH_LOG 9 /* Defaults, these can be overridden on the module command-line. */ -static int ip_list_tot = 100; -static int ip_pkt_list_tot = 20; -static int ip_list_hash_size = 0; -static int ip_list_perms = 0644; +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; #ifdef DEBUG static int debug = 1; #endif @@ -38,13 +38,13 @@ KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. htt MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); MODULE_LICENSE("GPL"); -module_param(ip_list_tot, int, 0400); -module_param(ip_pkt_list_tot, int, 0400); -module_param(ip_list_hash_size, int, 0400); -module_param(ip_list_perms, int, 0400); +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); #ifdef DEBUG -module_param(debug, int, 0600); -MODULE_PARM_DESC(debug,"debugging level, defaults to 1"); +module_param(debug, bool, 0600); +MODULE_PARM_DESC(debug,"enable debugging output"); #endif MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); @@ -104,6 +104,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop); /* Function to hash a given address into the hash table of table_size size */ @@ -317,7 +318,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned skb->nh.iph->daddr = 0; /* Clear ttl since we have no way of knowing it */ skb->nh.iph->ttl = 0; - 
match(skb,NULL,NULL,info,0,NULL); + match(skb,NULL,NULL,info,0,0,NULL); kfree(skb->nh.iph); out_free_skb: @@ -357,6 +358,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { int pkt_count, hits_found, ans; @@ -654,7 +656,7 @@ match(const struct sk_buff *skb, */ static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c index 086a1bb61e3e..9ab765e126f2 100644 --- a/net/ipv4/netfilter/ipt_tos.c +++ b/net/ipv4/netfilter/ipt_tos.c @@ -23,6 +23,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { const struct ipt_tos_info *info = matchinfo; @@ -32,7 +33,7 @@ match(const struct sk_buff *skb, static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index 219aa9de88cc..82da53f430ab 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -21,7 +21,7 @@ MODULE_LICENSE("GPL"); static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, - int offset, int *hotdrop) + int offset, unsigned int protoff, int *hotdrop) { const struct ipt_ttl_info *info = matchinfo; @@ -47,7 +47,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in, return 0; } -static int checkentry(const char *tablename, const struct ipt_ip *ip, +static int checkentry(const char *tablename, const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 260a4f0a2a90..212a3079085b 
100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -78,7 +78,8 @@ static struct ipt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE + .me = THIS_MODULE, + .af = AF_INET, }; /* The work comes in here from netfilter.c. */ diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 160eb11b6e2f..3212a5cc4b6b 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -109,6 +109,7 @@ static struct ipt_table packet_mangler = { .valid_hooks = MANGLE_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, + .af = AF_INET, }; /* The work comes in here from netfilter.c. */ diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 47449ba83eb9..fdb9e9c81e81 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -83,7 +83,8 @@ static struct ipt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE + .me = THIS_MODULE, + .af = AF_INET, }; /* The work comes in here from netfilter.c. 
*/ diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8202c1c0afad..167619f638c6 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -22,6 +22,7 @@ #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/sysctl.h> +#include <net/route.h> #include <net/ip.h> #include <linux/netfilter_ipv4.h> @@ -180,30 +181,6 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, return NF_ACCEPT; } -static unsigned int ipv4_refrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct rtable *rt = (struct rtable *)(*pskb)->dst; - - /* We've seen it coming out the other side: confirm */ - if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) - return NF_DROP; - - /* Local packets are never produced too large for their - interface. We degfragment them at LOCAL_OUT, however, - so we have to refragment them here. */ - if ((*pskb)->len > dst_mtu(&rt->u.dst) && - !skb_shinfo(*pskb)->tso_size) { - /* No hook can be after us, so this should be OK. */ - ip_fragment(*pskb, okfn); - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ipv4_conntrack_in(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, @@ -283,7 +260,7 @@ static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { /* Refragmenter; last chance. 
*/ static struct nf_hook_ops ipv4_conntrack_out_ops = { - .hook = ipv4_refrag, + .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, @@ -300,7 +277,7 @@ static struct nf_hook_ops ipv4_conntrack_local_in_ops = { #ifdef CONFIG_SYSCTL /* From nf_conntrack_proto_icmp.c */ -extern unsigned long nf_ct_icmp_timeout; +extern unsigned int nf_ct_icmp_timeout; static struct ctl_table_header *nf_ct_ipv4_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { @@ -392,6 +369,48 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int ipv4_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), + &tuple->src.u3.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), + &tuple->dst.u3.ip); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static int ipv4_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + t->src.u3.ip = + *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + t->dst.u3.ip = + *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + return 0; +} +#endif + static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -408,6 +427,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .print_conntrack = ipv4_print_conntrack, .prepare = ipv4_prepare, .get_features = ipv4_get_features, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = ipv4_tuple_to_nfattr, + .nfattr_to_tuple = 
ipv4_nfattr_to_tuple, +#endif .me = THIS_MODULE, }; @@ -551,7 +575,7 @@ MODULE_LICENSE("GPL"); static int __init init(void) { - need_nf_conntrack(); + need_conntrack(); return init_or_cleanup(1); } @@ -563,9 +587,4 @@ static void __exit fini(void) module_init(init); module_exit(fini); -void need_ip_conntrack(void) -{ -} - -EXPORT_SYMBOL(need_ip_conntrack); EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7ddb5c08f7b8..52dc175be39a 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -50,20 +50,21 @@ static int icmp_pkt_to_tuple(const struct sk_buff *skb, return 1; } +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - /* Add 1; spaces filled with 0. 
*/ - static u_int8_t invmap[] - = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; - if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return 0; @@ -120,11 +121,12 @@ static int icmp_packet(struct nf_conn *ct, static int icmp_new(struct nf_conn *conntrack, const struct sk_buff *skb, unsigned int dataoff) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { @@ -168,7 +170,7 @@ icmp_error_message(struct sk_buff *skb, return -NF_ACCEPT; } - innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + innerproto = __nf_ct_proto_find(PF_INET, inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); /* Are they talking about one of our connections? 
*/ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, @@ -281,6 +283,60 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t) +}; + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + if (tuple->dst.u.icmp.type >= sizeof(invmap) + || !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + + return 0; +} +#endif + struct nf_conntrack_protocol nf_conntrack_protocol_icmp = { .list = { NULL, NULL }, @@ -295,7 +351,12 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmp = .new = icmp_new, .error = icmp_error, .destroy = NULL, - .me = NULL + .me = NULL, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; 
EXPORT_SYMBOL(nf_conntrack_protocol_icmp); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> +#include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/sock.h> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4b0d7e4d6269..165a4d81efa4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -255,6 +255,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } + nf_reset(skb); skb_push(skb, skb->data - skb->nh.raw); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60ea48a1..e20be3331f67 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; - child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) inet_csk_reqsk_queue_add(sk, req, child); else diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a02b48b..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/sysctl.h> #include <linux/config.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -22,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -614,6 +616,15 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = 
"ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14ac56d..00aa80e93243 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int err = 0; if (level != SOL_TCP) - return tp->af_specific->setsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { @@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); - info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; @@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int val, len; if (level != SOL_TCP) - return tp->af_specific->getsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); if (get_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 1d0cd86621b1..035f2092d73a 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -30,8 +30,6 @@ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int low_utilization_threshold = 153; -static int low_utilization_period = 2; static int initial_ssthresh = 100; 
static int smooth_part = 20; @@ -43,10 +41,6 @@ module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); -module_param(low_utilization_threshold, int, 0644); -MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); -module_param(low_utilization_period, int, 0644); -MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); @@ -60,11 +54,6 @@ struct bictcp { u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ - u32 delay_min; /* min delay */ - u32 delay_max; /* max delay */ - u32 last_delay; - u8 low_utilization;/* 0: high; 1: low */ - u32 low_utilization_start; /* starting time of low utilization detection*/ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ @@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; - ca->delay_min = 0; - ca->delay_max = 0; - ca->last_delay = 0; - ca->low_utilization = 0; - ca->low_utilization_start = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } @@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if ( ca->loss_cwnd == 0 || - (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->loss_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } - -/* Detect low 
utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct sock *sk, int flag) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 dist, delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - tcp_time_stamp < ca->epoch_start + HZ || - /* this delay samples may not be accurate */ - flag == 0) { - ca->last_delay = 0; - goto notlow; - } - - delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ - ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - if (delay == 0) /* no previous delay sample */ - goto notlow; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - ca->delay_min = ca->delay_max = delay; - goto notlow; - } - - if (ca->delay_max < delay) - ca->delay_max = delay; - - /* utilization is low, if avg delay < dist*threshold - for checking_period time */ - dist = ca->delay_max - ca->delay_min; - if (dist <= ca->delay_min>>6 || - tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) - goto notlow; - - if (ca->low_utilization_start == 0) { - ca->low_utilization = 0; - ca->low_utilization_start = tcp_time_stamp; - } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) - > low_utilization_period*HZ) { - ca->low_utilization = 1; - } - - return; - - notlow: - ca->low_utilization = 0; - ca->low_utilization_start = 0; - -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(sk, data_acked); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ - /* in case of wrong delay_max*/ - if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) - ca->delay_max = ca->delay_min - + 
((ca->delay_max - ca->delay_min)* 90) / 100; - /* Wmax and fast convergence */ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) @@ -289,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state) bictcp_reset(inet_csk_ca(sk)); } -/* Track delayed acknowledgement ratio using sliding window +/* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, u32 cnt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c8dc12..e688c687d62d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + /* * TCP Reno congestion control * This is special case used for fallback as well. 
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 000000000000..31a4986dfbf7 --- /dev/null +++ b/net/ipv4/tcp_cubic.c @@ -0,0 +1,411 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * + * This is from the implementation of CUBIC TCP in + * Injong Rhee, Lisong Xu. + * "CUBIC: A New TCP-Friendly High-Speed TCP Variant + * in PFLDnet 2005 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> +#include <asm/div64.h> + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh = 100; +static int bic_scale = 41; +static int tcp_friendliness = 1; + +static u32 cube_rtt_scale; +static u32 beta_scale; +static u64 cube_factor; + +/* Note parameters that are used for precomputing scale factors are read-only */ +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(beta, int, 0444); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(bic_scale, int, 0444); +MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); 
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); + +#include <asm/div64.h> + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 bic_origin_point;/* origin point of bic function */ + u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 delay_min; /* min delay */ + u32 epoch_start; /* beginning of an epoch */ + u32 ack_cnt; /* number of acks */ + u32 tcp_cwnd; /* estimated tcp cwnd */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* 64bit divisor, dividend and result. 
dynamic precision */ +static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) +{ + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (uint32_t) dividend / d; + + return dividend; +} + +/* + * calculate the cubic root of x using Newton-Raphson + */ +static u32 cubic_root(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * cbrt(x) = exp(log(x) / 3) + */ + x = 1u << (fls64(a)/3); + + /* + * Iteration based on: + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + do { + x1 = x; + x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; + } while (abs(x1 - x) > 1); + + return x; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + u64 offs; + u32 delta, t, bic_target, min_cnt, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + 
* time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + /* change the unit from HZ to bictcp_HZ */ + t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + if (ca->delay_min > 0) { + /* max increment = Smax * rtt / 0.1 */ + min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); + if (ca->cnt < min_cnt) + ca->cnt = min_cnt; + } + + /* slow start and low utilization */ + if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ + ca->cnt = 50; + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Keep track of minimum rtt */ +static inline void measure_delay(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + (s32)(tcp_time_stamp - 
ca->epoch_start) < HZ) + return; + + delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (data_acked) + measure_delay(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + 
if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops cubictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + + cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by bic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, bic_scale * 10); + + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.0"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e23086bce..a97ed5416c28 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; /* Adapt the MSS value used to make delayed ack decision to the * 
real world. */ -static inline void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && @@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); } + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + /* Receiver "autotuning" code. 
* * The algorithm for RTT estimation w/o timestamps is based on @@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); @@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. 
*/ -static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static inline u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(const struct sk_buff *skb) { struct timeval tv, now; @@ -2342,7 +2383,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, if (nwin > tp->max_window) { tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } @@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr)>>2) { tp->rx_opt.saw_tstamp = 0; @@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } -static __inline__ int -tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) return 0; } -static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) @@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } 
-static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); @@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; @@ -3307,7 +3347,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, int offset = start - TCP_SKB_CB(skb)->seq; int size = TCP_SKB_CB(skb)->end_seq - start; - if (offset < 0) BUG(); + BUG_ON(offset < 0); if (size > 0) { size = min(copy, size); if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) @@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) { /* If the user specified a specific send buffer setting, do * not modify it. 
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) sk->sk_write_space(sk); } -static inline void tcp_check_space(struct sock *sk) +static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { tcp_push_pending_frames(sk, tp); tcp_check_space(sk); @@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ @@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) return result; } -static __inline__ int -tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete_user(sk, skb); @@ -3967,12 +4006,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { - struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -4061,7 +4100,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! 
@@ -4072,7 +4111,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_set_state(sk, TCP_ESTABLISHED); /* Make sure socket is routed, for correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4098,8 +4137,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - icsk = inet_csk(sk); - if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -4173,7 +4210,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4220,6 +4257,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -4236,7 +4274,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb) < 0) + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -4349,7 +4387,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Make sure socket is routed, for * correct metrics. 
*/ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4475,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e1929b..6ea353907af5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -69,6 +69,7 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/timewait_sock.h> #include <net/xfrm.h> #include <linux/inet.h> @@ -86,8 +87,7 @@ int sysctl_tcp_low_latency; /* Socket used for sending RSTs */ static struct socket *tcp_socket; -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb); +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, @@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - return inet_csk_get_port(&tcp_hashinfo, sk, snum); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void tcp_v4_hash(struct sock *sk) @@ -118,202 +119,38 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } -/* called with local bh disabled */ -static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { - struct inet_sock *inet = inet_sk(sk); - u32 daddr = inet->rcv_saddr; - u32 saddr = inet->daddr; - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = 
inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - /* With PAWS, it is safe from the viewpoint - of data integrity. Even without PAWS it - is safe provided sequence spaces do not - overlap i.e. at data rates <= 80Mbit/sec. - - Actually, the idea is close to VJ's one, - only timestamp cache is held not per host, - but per port pair and TW bucket is used - as state holder. + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); - If TW bucket has been already destroyed we - fall back to VJ's scheme and use initial - timestamp retrieved from peer table. - */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. -unique: - /* Must record num and sport now. 
Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; } return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; } -static inline u32 connect_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - - return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); -} - -/* - * Bind a port for a connect operation and hash it. 
- */ -static inline int tcp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + connect_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v4_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet_hash(&tcp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row);; - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(&tcp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... 
Walk to established hash table */ - ret = __tcp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} +EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -383,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - tp->ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) - tp->ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; @@ -395,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v4_hash_connect(sk); + err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; @@ -433,12 +270,10 @@ failure: /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, - u32 mtu) +static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -467,7 +302,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's @@ -644,10 +479,10 @@ out: } /* This routine computes an IPv4 TCP checksum. 
*/ -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); @@ -826,7 +661,8 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static inline void syn_flood_warning(struct sk_buff *skb) +#ifdef CONFIG_SYN_COOKIES +static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -837,12 +673,13 @@ static inline void syn_flood_warning(struct sk_buff *skb) ntohs(skb->h.th->dest)); } } +#endif /* * Save and compile IPv4 options into the request_sock if needed. */ -static inline struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; @@ -869,6 +706,11 @@ struct request_sock_ops tcp_request_sock_ops = { .send_reset = tcp_v4_send_reset, }; +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -1053,9 +895,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) - newtp->ext_header_len = newinet->opt->optlen; + inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1238,6 +1080,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + 
nf_reset(skb); if (sk_filter(sk, skb, 0)) goto discard_and_relse; @@ -1314,16 +1157,6 @@ do_time_wait: goto discard_it; } -static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; -} - /* VJ's idea. Save last timestamp seen from this destination * and hold it at least for normal timewait interval to use for duplicate * segment detection in subsequent connections, before they enter synchronized @@ -1382,7 +1215,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct tcp_func ipv4_specific = { +struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1392,7 +1225,7 @@ struct tcp_func ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = v4_addr2sockaddr, + .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; @@ -1433,7 +1266,8 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1989,7 +1823,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2ac4321..2b9b7f6c7f7c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,18 +274,18 @@ kill: void 
tcp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6; - ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; } #endif @@ -456,7 +458,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock **prev) { struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -613,7 +614,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. 
*/ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0b406a..a7623ead39a8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; -static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) @@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } -static inline void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); @@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. 
*/ -static __inline__ u16 tcp_select_window(struct sock *sk) +static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); @@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) return new_win; } +static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, + __u32 tstamp) +{ + if (tp->rx_opt.tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->rx_opt.ts_recent); + } + if (tp->rx_opt.eff_sacks) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks--; + } + } +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, + __u32 ts_recent) +{ + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. 
If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +} /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial @@ -371,7 +440,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ECN_send(sk, tp, skb, tcp_header_size); } - tp->af_specific->send_check(sk, th, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -381,7 +450,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); - err = tp->af_specific->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (unlikely(err <= 0)) return err; @@ -621,7 +690,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) It is minimum of user_mss and mss received with SYN. It also does not include TCP options. - tp->pmtu_cookie is last pmtu, seen by this function. + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. 
tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, @@ -631,26 +700,26 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; - + struct inet_connection_sock *icsk = inet_csk(sk); /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - + sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -664,7 +733,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); /* And store cached results */ - tp->pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; tp->mss_cache = mss_now; return mss_now; @@ -694,7 +763,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (dst) { u32 mtu = dst_mtu(dst); - if (mtu != tp->pmtu_cookie) + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -705,9 +774,10 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = 65535 - - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; + xmit_size_goal = (65535 - + 
inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) @@ -723,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out = tp->packets_out; @@ -772,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1422,7 +1492,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if(tp->af_specific->rebuild_header(sk)) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a @@ -1793,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* * Do all connect socket setups that can be done AF independent. 
*/ -static inline void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13e7e6e8df16..3b7403495052 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2422a5f7195d..00840474a449 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> +#include <linux/igmp.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> @@ -846,20 +847,7 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS); - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (noblock) return -EAGAIN; @@ -1001,6 +989,7 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) kfree_skb(skb); return -1; } + nf_reset(skb); if (up->encap_type) { /* @@ -1094,7 +1083,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. 
*/ -static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, +static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { if (uh->check == 0) { @@ -1108,7 +1097,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - return 0; } /* @@ -1141,8 +1129,7 @@ int udp_rcv(struct sk_buff *skb) if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) - goto csum_error; + udp_checksum_init(skb, uh, ulen, saddr, daddr); if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); @@ -1163,6 +1150,7 @@ int udp_rcv(struct sk_buff *skb) if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; + nf_reset(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_checksum_complete(skb)) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 2d3849c38a0f..850d919591d1 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -11,6 +11,8 @@ #include <linux/module.h> #include <linux/string.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/xfrm.h> @@ -45,6 +47,23 @@ static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) return xfrm_parse_spi(skb, nexthdr, spi, seq); } +#ifdef CONFIG_NETFILTER +static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + + if (skb->dst == NULL) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev)) + goto drop; + } + return dst_input(skb); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} +#endif + int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) { int err; @@ -137,6 +156,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) 
memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); skb->sp->len += xfrm_nr; + nf_reset(skb); + if (decaps) { if (!(skb->dev->flags&IFF_LOOPBACK)) { dst_release(skb->dst); @@ -145,7 +166,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) netif_rx(skb); return 0; } else { +#ifdef CONFIG_NETFILTER + __skb_push(skb, skb->data - skb->nh.raw); + skb->nh.iph->tot_len = htons(skb->len); + ip_send_check(skb->nh.iph); + + NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, + xfrm4_rcv_encap_finish); + return 0; +#else return -skb->nh.iph->protocol; +#endif } drop_unlock: diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 66620a95942a..d4df0ddd424b 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -8,8 +8,10 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/compiler.h> #include <linux/skbuff.h> #include <linux/spinlock.h> +#include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/xfrm.h> @@ -95,7 +97,7 @@ out: return ret; } -int xfrm4_output(struct sk_buff *skb) +static int xfrm4_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; @@ -113,27 +115,33 @@ int xfrm4_output(struct sk_buff *skb) goto error_nolock; } - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; + do { + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; - xfrm4_encap(skb); + xfrm4_encap(skb); - err = x->type->output(x, skb); - if (err) - goto error; + err = x->type->output(x, skb); + if (err) + goto error; - x->curlft.bytes += skb->len; - x->curlft.packets++; + x->curlft.bytes += skb->len; + x->curlft.packets++; - spin_unlock_bh(&x->lock); + spin_unlock_bh(&x->lock); - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - err = NET_XMIT_BYPASS; + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto 
error_nolock; + } + dst = skb->dst; + x = dst->xfrm; + } while (x && !x->props.mode); + + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; + err = 0; out_exit: return err; @@ -143,3 +151,33 @@ error_nolock: kfree_skb(skb); goto out_exit; } + +int xfrm4_output_finish(struct sk_buff *skb) +{ + int err; + + while (likely((err = xfrm4_output_one(skb)) == 0)) { + nf_reset(skb); + + err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, &skb, NULL, + skb->dst->dev, dst_output); + if (unlikely(err != 1)) + break; + + if (!skb->dst->xfrm) + return dst_output(skb); + + err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL, + skb->dst->dev, xfrm4_output_finish); + if (unlikely(err != 1)) + break; + } + + return err; +} + +int xfrm4_output(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm4_output_finish); +} diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index d23e07fc81fa..dbabf81a9b7b 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -42,6 +42,21 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->props.saddr = tmpl->saddr; if (x->props.saddr.a4 == 0) x->props.saddr.a4 = saddr->a4; + if (tmpl->mode && x->props.saddr.a4 == 0) { + struct rtable *rt; + struct flowi fl_tunnel = { + .nl_u = { + .ip4_u = { + .daddr = x->id.daddr.a4, + } + } + }; + if (!xfrm_dst_lookup((struct xfrm_dst **)&rt, + &fl_tunnel, AF_INET)) { + x->props.saddr.a4 = rt->rt_src; + dst_release(&rt->u.dst); + } + } x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; x->props.family = AF_INET; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6460eec834b7..41877abd22e6 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,10 +8,11 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o netfilter.o + 
ip6_flowlabel.o ipv6_syms.o inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o +ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-objs += $(ipv6-y) obj-$(CONFIG_INET6_AH) += ah6.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a60585fd85ad..dfb4f145a139 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -58,6 +58,7 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +#include <linux/capability.h> #include <linux/delay.h> #include <linux/notifier.h> #include <linux/string.h> @@ -1195,7 +1196,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device * int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); @@ -1228,7 +1229,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) /* Gets referenced address, destroys ifaddr */ -void addrconf_dad_stop(struct inet6_ifaddr *ifp) +static void addrconf_dad_stop(struct inet6_ifaddr *ifp) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); @@ -2643,7 +2644,7 @@ static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; seq_printf(seq, - "%04x%04x%04x%04x%04x%04x%04x%04x %02x %02x %02x %02x %8s\n", + NIP6_FMT " %02x %02x %02x %02x %8s\n", NIP6(ifp->addr), ifp->idev->dev->ifindex, ifp->prefix_len, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9546380fa04..064ffab82a9f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -22,6 +22,7 @@ #include <linux/module.h> +#include <linux/capability.h> #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> @@ -167,6 +168,7 @@ 
lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -389,6 +391,8 @@ int inet6_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + /* * This does both peername and sockname. */ @@ -431,7 +435,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - int err = -EINVAL; switch(cmd) { @@ -450,16 +453,15 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr((void __user *) arg); default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) - return(dev_ioctl(cmd,(void __user *) arg)); - return err; + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return(0); } -struct proto_ops inet6_stream_ops = { +const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -480,7 +482,7 @@ struct proto_ops inet6_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet6_dgram_ops = { +const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -508,7 +510,7 @@ static struct net_proto_family inet6_family_ops = { }; /* Same as inet6_dgram_ops, sans udp_poll. 
*/ -static struct proto_ops inet6_sockraw_ops = { +static const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -609,17 +611,90 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +int inet6_sk_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + 
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { if (ptr == NULL) return -EINVAL; - ptr[0] = __alloc_percpu(mibsize, mibalign); + ptr[0] = __alloc_percpu(mibsize); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, mibalign); + ptr[1] = __alloc_percpu(mibsize); if (!ptr[1]) goto err1; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f3629730eb15..c7932cb420a5 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <net/xfrm.h> #include <asm/scatterlist.h> @@ -331,8 +332,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!x) return; - NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" NIP6_FMT "\n", ntohl(ah->spi), NIP6(iph->daddr)); xfrm_state_put(x); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 6b7294047238..72bd08af2dfb 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/module.h> #include <linux/errno.h> @@ -531,9 +532,7 @@ static int ac6_seq_show(struct seq_file *seq, void *v) struct ac6_iter_state *state = ac6_seq_private(seq); seq_printf(seq, - "%-4d %-15s " - "%04x%04x%04x%04x%04x%04x%04x%04x " - "%5d\n", + "%-4d %-15s " NIP6_FMT " %5d\n", state->dev->ifindex, state->dev->name, NIP6(im->aca_addr), im->aca_users); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c4a3a993acb7..99a6eb23378b 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe9970793..7b5b94f13902 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <linux/random.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/icmpv6.h> static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) @@ -265,8 +266,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); if (!x) return; - printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" NIP6_FMT "\n", ntohl(esph->spi), NIP6(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index be6faf311387..2a1e7e45b890 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = { {-1, NULL} }; -static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_destopt_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = IP6CB(skb); @@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { skb->h.raw += ((skb->h.raw[1]+1)<<3); - *nhoffp = opt->dst1; + opt->nhoff = opt->dst1; return 1; } @@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void) NONE header. No data in packet. ********************************/ -static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_nodata_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; @@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void) Routing header. 
********************************/ -static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_rthdr_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct inet6_skb_parm *opt = IP6CB(skb); @@ -249,7 +249,7 @@ looped_back: skb->h.raw += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; - *nhoffp = (&hdr->nexthdr) - skb->nh.raw; + opt->nhoff = (&hdr->nexthdr) - skb->nh.raw; return 1; } @@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) return opt; } +EXPORT_SYMBOL_GPL(ipv6_invert_rthdr); + /********************************** Hop-by-hop options. **********************************/ @@ -485,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt_lst[] = { int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff) { - IP6CB(skb)->hop = sizeof(struct ipv6hdr); - if (ip6_parse_tlv(tlvprochopopt_lst, skb)) + struct inet6_skb_parm *opt = IP6CB(skb); + + opt->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { + skb->h.raw += (skb->h.raw[1]+1)<<3; + opt->nhoff = sizeof(struct ipv6hdr); return sizeof(struct ipv6hdr); + } return -1; } @@ -579,6 +586,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) return opt2; } +EXPORT_SYMBOL_GPL(ipv6_dup_options); + static int ipv6_renew_option(void *ohdr, struct ipv6_opt_hdr __user *newopt, int newoptlen, int inherit, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6ec6a2b549bb..fcf883183cef 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL; #define icmpv6_socket __get_cpu_var(__icmpv6_socket) -static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); +static int icmpv6_rcv(struct sk_buff **pskb); static struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, @@ -581,7 +581,7 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) 
* Handle icmp messages */ -static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int icmpv6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; @@ -607,7 +607,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 0); if (__skb_checksum_complete(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [" NIP6_FMT " > " NIP6_FMT "]\n", NIP6(*saddr), NIP6(*daddr)); goto discard_it; } diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 000000000000..f8f3a37a1494 --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,200 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. 
+ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> + +#include <net/addrconf.h> +#include <net/inet_connection_sock.h> +#include <net/inet_ecn.h> +#include <net/inet_hashtables.h> +#include <net/ip6_route.h> +#include <net/sock.h> +#include <net/inet6_connection_sock.h> + +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + +/* + * request_sock (formerly open request) hash tables. + */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 a = raddr->s6_addr32[0]; + u32 b = raddr->s6_addr32[1]; + u32 c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32)rport; + __jhash_mix(a, b, c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet6_request_sock *treq = inet6_rsk(req); 
+ + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = 
inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 01d5f46d4e40..4154f3a8b6cf 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -5,7 +5,8 @@ * * Generic INET6 transport hashtables * - * Authors: Lotsa people, from code originally in tcp + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo <acme@mandriva.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,12 +15,13 @@ */ #include <linux/config.h> - #include <linux/module.h> +#include <linux/random.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> +#include <net/ip.h> struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, @@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, } EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const 
__u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, + inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... 
*/ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; + const int range = high - low; + int i, port; + static u32 hint; + const u32 offset = hint + inet6_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. 
+ */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(hinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet6_hash(hinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ + ret = __inet6_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1cf02765fb5c..4183c8dac7f6 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -9,6 +9,7 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> @@ -200,6 +201,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) return NULL; } +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -626,9 +629,7 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl) { while(fl) { seq_printf(seq, - "%05X %-1d %-6d %-6d %-6ld %-8ld " - "%02x%02x%02x%02x%02x%02x%02x%02x " - "%-4d\n", + "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_FMT " %-4d\n", (unsigned)ntohl(fl->label), fl->share, (unsigned)fl->owner, @@ -644,8 +645,8 @@ static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl) static int ip6fl_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "Label S Owner Users Linger Expires " - "Dst Opt\n"); + seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-39s %s\n", + "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); else ip6fl_fl_seq_show(seq, v); return 0; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index a6026d2787d2..29f73592e68e 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -48,7 +48,7 @@ -static inline int ip6_rcv_finish( struct sk_buff *skb) +inline int ip6_rcv_finish( struct sk_buff *skb) { if (skb->dst == NULL) ip6_route_input(skb); @@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + skb->h.raw = (u8 *)(hdr + 1); + 
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + pkt_len = ntohs(hdr->payload_len); /* pkt_len may be zero if Jumbo payload option is present */ @@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt } if (hdr->nexthdr == NEXTHDR_HOP) { - skb->h.raw = (u8*)(hdr+1); - if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) { + if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) { IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); return 0; } @@ -143,26 +145,15 @@ static inline int ip6_input_finish(struct sk_buff *skb) int nexthdr; u8 hash; - skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); - /* * Parse extension headers */ - nexthdr = skb->nh.ipv6h->nexthdr; - nhoff = offsetof(struct ipv6hdr, nexthdr); - - /* Skip hop-by-hop options, they are already parsed. */ - if (nexthdr == NEXTHDR_HOP) { - nhoff = sizeof(struct ipv6hdr); - nexthdr = skb->h.raw[0]; - skb->h.raw += (skb->h.raw[1]+1)<<3; - } - rcu_read_lock(); resubmit: if (!pskb_pull(skb, skb->h.raw - skb->data)) goto discard; + nhoff = IP6CB(skb)->nhoff; nexthdr = skb->nh.raw[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); @@ -194,7 +185,7 @@ resubmit: !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - ret = ipprot->handler(&skb, &nhoff); + ret = ipprot->handler(&skb); if (ret > 0) goto resubmit; else if (ret == 0) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8523c76ebf76..efa3e72cfcfa 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -226,6 +226,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); ipv6_addr_copy(&hdr->daddr, first_hop); + skb->priority = sk->sk_priority; + mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok) { IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); @@ -775,6 +777,8 @@ out_err_release: return err; } +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + static inline int ip6_ufo_append_data(struct sock *sk, int 
getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), @@ -1180,6 +1184,8 @@ int ip6_push_pending_frames(struct sock *sk) ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); ipv6_addr_copy(&hdr->daddr, final_dst); + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e315d0f80af1..92ead3cf956b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -21,6 +21,7 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> @@ -243,7 +244,7 @@ ip6_tnl_create(struct ip6_tnl_parm *p, struct ip6_tnl **pt) if (dev == NULL) return -ENOMEM; - t = dev->priv; + t = netdev_priv(dev); dev->init = ip6ip6_tnl_dev_init; t->parms = *p; @@ -308,7 +309,7 @@ ip6ip6_tnl_locate(struct ip6_tnl_parm *p, struct ip6_tnl **pt, int create) static void ip6ip6_tnl_dev_uninit(struct net_device *dev) { - struct ip6_tnl *t = dev->priv; + struct ip6_tnl *t = netdev_priv(dev); if (dev == ip6ip6_fb_tnl_dev) { write_lock_bh(&ip6ip6_lock); @@ -510,7 +511,7 @@ static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph, **/ static int -ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +ip6ip6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipv6hdr *ipv6h; @@ -623,7 +624,7 @@ ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) static int ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->stat; struct ipv6hdr *ipv6h = skb->nh.ipv6h; struct ipv6_txoptions *opt = NULL; @@ -933,11 +934,11 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } if ((err = ip6ip6_tnl_locate(&p, 
&t, 0)) == -ENODEV) - t = (struct ip6_tnl *) dev->priv; + t = netdev_priv(dev); else if (err) break; } else - t = (struct ip6_tnl *) dev->priv; + t = netdev_priv(dev); memcpy(&p, &t->parms, sizeof (p)); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { @@ -955,7 +956,7 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } if (!create && dev != ip6ip6_fb_tnl_dev) { - t = (struct ip6_tnl *) dev->priv; + t = netdev_priv(dev); } if (!t && (err = ip6ip6_tnl_locate(&p, &t, create))) { break; @@ -991,12 +992,12 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = ip6ip6_tnl_locate(&p, &t, 0); if (err) break; - if (t == ip6ip6_fb_tnl_dev->priv) { + if (t == netdev_priv(ip6ip6_fb_tnl_dev)) { err = -EPERM; break; } } else { - t = (struct ip6_tnl *) dev->priv; + t = netdev_priv(dev); } err = unregister_netdevice(t->dev); break; @@ -1016,7 +1017,7 @@ ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static struct net_device_stats * ip6ip6_tnl_get_stats(struct net_device *dev) { - return &(((struct ip6_tnl *) dev->priv)->stat); + return &(((struct ip6_tnl *)netdev_priv(dev))->stat); } /** @@ -1073,7 +1074,7 @@ static void ip6ip6_tnl_dev_setup(struct net_device *dev) static inline void ip6ip6_tnl_dev_init_gen(struct net_device *dev) { - struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct ip6_tnl *t = netdev_priv(dev); t->fl.proto = IPPROTO_IPV6; t->dev = dev; strcpy(t->parms.name, dev->name); @@ -1087,7 +1088,7 @@ ip6ip6_tnl_dev_init_gen(struct net_device *dev) static int ip6ip6_tnl_dev_init(struct net_device *dev) { - struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct ip6_tnl *t = netdev_priv(dev); ip6ip6_tnl_dev_init_gen(dev); ip6ip6_tnl_link_config(t); return 0; @@ -1103,7 +1104,7 @@ ip6ip6_tnl_dev_init(struct net_device *dev) static int ip6ip6_fb_tnl_dev_init(struct net_device *dev) { - struct ip6_tnl *t = dev->priv; + struct ip6_tnl *t = netdev_priv(dev); 
ip6ip6_tnl_dev_init_gen(dev); dev_hold(dev); tnls_wc[0] = t; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb17094..d511a884dad0 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include <linux/rtnetlink.h> #include <net/icmp.h> #include <net/ipv6.h> +#include <net/protocol.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> @@ -211,8 +212,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!x) return; - printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIP6_FMT "\n", spi, NIP6(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3620718defe6..f7142ba519ab 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -26,6 +26,7 @@ */ #include <linux/module.h> +#include <linux/capability.h> #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> @@ -163,17 +164,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); sock_prot_dec_use(sk->sk_prot); sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { local_bh_disable(); sock_prot_dec_use(sk->sk_prot); @@ -317,14 +318,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr 
!= LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -380,14 +382,15 @@ sticky_done: goto done; update: retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -547,7 +550,7 @@ done: retv = -ENOBUFS; break; } - gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + gsf = kmalloc(optlen,GFP_KERNEL); if (gsf == 0) { retv = -ENOBUFS; break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f829a4ad3ccc..0e03eabfb9da 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; - mc_lst->sflock = RW_LOCK_UNLOCKED; + rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; /* @@ -449,8 +449,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) count += psl->sl_max; - newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, - IP6_SFLSIZE(count), GFP_ATOMIC); + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC); if (!newpsl) { err = -ENOBUFS; goto done; @@ -535,8 +534,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } if (gsf->gf_numsrc) { - newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, - IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), + GFP_ATOMIC); if (!newpsl) { err 
= -ENOBUFS; goto done; @@ -768,7 +767,7 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = (struct ifmcaddr6 *)kmalloc(sizeof(*pmc), GFP_ATOMIC); + pmc = kmalloc(sizeof(*pmc), GFP_ATOMIC); if (!pmc) return; memset(pmc, 0, sizeof(*pmc)); @@ -1937,7 +1936,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, psf_prev = psf; } if (!psf) { - psf = (struct ip6_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + psf = kmalloc(sizeof(*psf), GFP_ATOMIC); if (!psf) return -ENOBUFS; memset(psf, 0, sizeof(*psf)); @@ -2374,7 +2373,7 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v) struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, - "%-4d %-15s %04x%04x%04x%04x%04x%04x%04x%04x %5d %08X %ld\n", + "%-4d %-15s " NIP6_FMT " %5d %08X %ld\n", state->dev->ifindex, state->dev->name, NIP6(im->mca_addr), im->mca_users, im->mca_flags, @@ -2543,15 +2542,12 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_printf(seq, "%3s %6s " - "%32s %32s %6s %6s\n", "Idx", + "%39s %39s %6s %6s\n", "Idx", "Device", "Multicast Address", "Source Address", "INC", "EXC"); } else { seq_printf(seq, - "%3d %6.6s " - "%04x%04x%04x%04x%04x%04x%04x%04x " - "%04x%04x%04x%04x%04x%04x%04x%04x " - "%6lu %6lu\n", + "%3d %6.6s " NIP6_FMT " " NIP6_FMT " %6lu %6lu\n", state->dev->ifindex, state->dev->name, NIP6(state->im->mca_addr), NIP6(psf->sf_addr), diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 305d9ee6d7db..cb8856b1d951 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -692,7 +692,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!(neigh->nud_state & NUD_VALID)) { ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: " - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NIP6_FMT "\n", __FUNCTION__, NIP6(*target)); } diff --git 
a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index f8626ebf90fd..d750cfc019dc 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -1,15 +1,12 @@ -#include <linux/config.h> -#include <linux/init.h> - -#ifdef CONFIG_NETFILTER - #include <linux/kernel.h> +#include <linux/init.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/dst.h> #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/xfrm.h> int ip6_route_me_harder(struct sk_buff *skb) { @@ -21,11 +18,17 @@ int ip6_route_me_harder(struct sk_buff *skb) { .ip6_u = { .daddr = iph->daddr, .saddr = iph->saddr, } }, - .proto = iph->nexthdr, }; dst = ip6_route_output(skb->sk, &fl); +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, &fl, AF_INET6) == 0) + if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) + return -1; +#endif + if (dst->error) { IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); @@ -87,18 +90,10 @@ int __init ipv6_netfilter_init(void) return nf_register_queue_rerouter(PF_INET6, &ip6_reroute); } +/* This can be called from inet6_init() on errors, so it cannot + * be marked __exit. -DaveM + */ void ipv6_netfilter_fini(void) { nf_unregister_queue_rerouter(PF_INET6); } - -#else /* CONFIG_NETFILTER */ -int __init ipv6_netfilter_init(void) -{ - return 0; -} - -void ipv6_netfilter_fini(void) -{ -} -#endif /* CONFIG_NETFILTER */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 04912f9b35c3..2d6f8ecbc27b 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -41,6 +41,7 @@ config IP6_NF_QUEUE config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering/masq/NAT)" + depends on NETFILTER_XTABLES help ip6tables is a general, extensible packet identification framework. 
Currently only the packet filtering and packet mangling subsystem @@ -50,25 +51,6 @@ config IP6_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The simple matches. -config IP6_NF_MATCH_LIMIT - tristate "limit match support" - depends on IP6_NF_IPTABLES - help - limit matching allows you to control the rate at which a rule can be - matched: mainly useful in combination with the LOG target ("LOG - target support", below) and to avoid some Denial of Service attacks. - - To compile it as a module, choose M here. If unsure, say N. - -config IP6_NF_MATCH_MAC - tristate "MAC address match support" - depends on IP6_NF_IPTABLES - help - mac matching allows you to match packets based on the source - Ethernet address of the packet. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_RT tristate "Routing header match support" depends on IP6_NF_IPTABLES @@ -124,16 +106,6 @@ config IP6_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_MARK - tristate "netfilter MARK match support" - depends on IP6_NF_IPTABLES - help - Netfilter mark matching allows you to match packets based on the - `nfmark' value in the packet. This can be set by the MARK target - (see below). - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_IPV6HEADER tristate "IPv6 Extension Headers Match" depends on IP6_NF_IPTABLES @@ -151,15 +123,6 @@ config IP6_NF_MATCH_AHESP To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_LENGTH - tristate "Packet Length match support" - depends on IP6_NF_IPTABLES - help - This option allows you to match the length of a packet against a - specific value or range of values. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_EUI64 tristate "EUI64 address check" depends on IP6_NF_IPTABLES @@ -170,12 +133,13 @@ config IP6_NF_MATCH_EUI64 To compile it as a module, choose M here. 
If unsure, say N. -config IP6_NF_MATCH_PHYSDEV - tristate "Physdev match support" - depends on IP6_NF_IPTABLES && BRIDGE_NETFILTER +config IP6_NF_MATCH_POLICY + tristate "IPsec policy match support" + depends on IP6_NF_IPTABLES && XFRM help - Physdev packet matching matches against the physical bridge ports - the IP packet arrived on or will leave by. + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. To compile it as a module, choose M here. If unsure, say N. @@ -209,17 +173,6 @@ config IP6_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_TARGET_NFQUEUE - tristate "NFQUEUE Target Support" - depends on IP6_NF_IPTABLES - help - This Target replaced the old obsolete QUEUE target. - - As opposed to QUEUE, it supports 65535 different queues, - not just one. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES @@ -230,19 +183,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_TARGET_MARK - tristate "MARK target support" - depends on IP6_NF_MANGLE - help - This option adds a `MARK' target, which allows you to create rules - in the `mangle' table which alter the netfilter mark (nfmark) field - associated with the packet packet prior to routing. This can change - the routing method (see `Use netfilter MARK value as routing - key') and can also be used by other subsystems to change their - behavior. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_TARGET_HL tristate 'HL (hoplimit) target support' depends on IP6_NF_MANGLE diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 9ab5b2ca1f59..663b4749820d 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -4,24 +4,19 @@ # Link order matters here. 
obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o -obj-$(CONFIG_IP6_NF_MATCH_LIMIT) += ip6t_limit.o -obj-$(CONFIG_IP6_NF_MATCH_MARK) += ip6t_mark.o obj-$(CONFIG_IP6_NF_MATCH_LENGTH) += ip6t_length.o -obj-$(CONFIG_IP6_NF_MATCH_MAC) += ip6t_mac.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o -obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o -obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o -obj-$(CONFIG_IP6_NF_TARGET_NFQUEUE) += ip6t_NFQUEUE.o obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d469271c4d..847068fd3367 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -13,15 +13,19 @@ * a table * 06 Jun 2002 Andras Kis-Szabo <kisza@sch.bme.hu> * - new extension header parser code + * 15 Oct 2005 Harald Welte <laforge@netfilter.org> + * - Unification of {ip,ip6}_tables into x_tables + * - Removed tcp and udp code, since it's not ipv6 specific */ + +#include <linux/capability.h> #include <linux/config.h> +#include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> -#include <linux/tcp.h> -#include <linux/udp.h> #include <linux/icmpv6.h> #include <net/ipv6.h> #include <asm/uaccess.h> @@ -30,6 +34,7 @@ #include 
<linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -64,13 +69,8 @@ do { \ #else #define IP_NF_ASSERT(x) #endif -#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) -static DECLARE_MUTEX(ip6t_mutex); -/* Must have mutex */ -#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) -#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) #include <linux/netfilter_ipv4/listhelp.h> #if 0 @@ -86,55 +86,22 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ -/* The table itself */ -struct ip6t_table_info -{ - /* Size per table */ - unsigned int size; - /* Number of entries: FIXME. --RR */ - unsigned int number; - /* Initial number of entries. 
Needed for module usage count */ - unsigned int initial_entries; - - /* Entry points and underflows */ - unsigned int hook_entry[NF_IP6_NUMHOOKS]; - unsigned int underflow[NF_IP6_NUMHOOKS]; - - /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; -}; - -static LIST_HEAD(ip6t_target); -static LIST_HEAD(ip6t_match); -static LIST_HEAD(ip6t_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) - -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) #define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) #endif -static int ip6_masked_addrcmp(struct in6_addr addr1, struct in6_addr mask, - struct in6_addr addr2) +int +ip6_masked_addrcmp(const struct in6_addr *addr1, const struct in6_addr *mask, + const struct in6_addr *addr2) { int i; for( i = 0; i < 16; i++){ - if((addr1.s6_addr[i] & mask.s6_addr[i]) != - (addr2.s6_addr[i] & mask.s6_addr[i])) + if((addr1->s6_addr[i] & mask->s6_addr[i]) != + (addr2->s6_addr[i] & mask->s6_addr[i])) return 1; } return 0; @@ -168,10 +135,10 @@ ip6_packet_match(const struct sk_buff *skb, #define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) - if (FWINV(ip6_masked_addrcmp(ipv6->saddr,ip6info->smsk,ip6info->src), - IP6T_INV_SRCIP) - || FWINV(ip6_masked_addrcmp(ipv6->daddr,ip6info->dmsk,ip6info->dst), - IP6T_INV_DSTIP)) { + if (FWINV(ip6_masked_addrcmp(&ipv6->saddr, &ip6info->smsk, + &ip6info->src), IP6T_INV_SRCIP) + || FWINV(ip6_masked_addrcmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst), IP6T_INV_DSTIP)) { dprintf("Source or dest mismatch.\n"); /* dprintf("SRC: %u. Mask: %u. 
Target: %u.%s\n", ip->saddr, @@ -214,69 +181,21 @@ ip6_packet_match(const struct sk_buff *skb, /* look for the desired protocol header */ if((ip6info->flags & IP6T_F_PROTO)) { - u_int8_t currenthdr = ipv6->nexthdr; - struct ipv6_opt_hdr _hdr, *hp; - u_int16_t ptr; /* Header offset in skb */ - u_int16_t hdrlen; /* Header */ - u_int16_t _fragoff = 0, *fp = NULL; - - ptr = IPV6_HDR_LEN; - - while (ip6t_ext_hdr(currenthdr)) { - /* Is there enough space for the next ext header? */ - if (skb->len - ptr < IPV6_OPTHDR_LEN) - return 0; - - /* NONE or ESP: there isn't protocol part */ - /* If we want to count these packets in '-p all', - * we will change the return 0 to 1*/ - if ((currenthdr == IPPROTO_NONE) || - (currenthdr == IPPROTO_ESP)) - break; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - BUG_ON(hp == NULL); - - /* Size calculation */ - if (currenthdr == IPPROTO_FRAGMENT) { - fp = skb_header_pointer(skb, - ptr+offsetof(struct frag_hdr, - frag_off), - sizeof(_fragoff), - &_fragoff); - if (fp == NULL) - return 0; - - _fragoff = ntohs(*fp) & ~0x7; - hdrlen = 8; - } else if (currenthdr == IPPROTO_AH) - hdrlen = (hp->hdrlen+2)<<2; - else - hdrlen = ipv6_optlen(hp); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - /* ptr is too large */ - if ( ptr > skb->len ) - return 0; - if (_fragoff) { - if (ip6t_ext_hdr(currenthdr)) - return 0; - break; - } - } + int protohdr; + unsigned short _frag_off; - *protoff = ptr; - *fragoff = _fragoff; + protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off); + if (protohdr < 0) + return 0; - /* currenthdr contains the protocol header */ + *fragoff = _frag_off; dprintf("Packet protocol %hi ?= %s%hi.\n", - currenthdr, + protohdr, ip6info->invflags & IP6T_INV_PROTO ? 
"!":"", ip6info->proto); - if (ip6info->proto == currenthdr) { + if (ip6info->proto == protohdr) { if(ip6info->invflags & IP6T_INV_PROTO) { return 0; } @@ -351,7 +270,7 @@ ip6t_do_table(struct sk_buff **pskb, unsigned int hook, const struct net_device *in, const struct net_device *out, - struct ip6t_table *table, + struct xt_table *table, void *userdata) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); @@ -363,6 +282,7 @@ ip6t_do_table(struct sk_buff **pskb, const char *indev, *outdev; void *table_base; struct ip6t_entry *e, *back; + struct xt_table_info *private; /* Initialization */ indev = in ? in->name : nulldevname; @@ -375,10 +295,10 @@ ip6t_do_table(struct sk_buff **pskb, * match it. */ read_lock_bh(&table->lock); + private = table->private; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); - e = get_entry(table_base, table->private->hook_entry[hook]); + table_base = (void *)private->entries[smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG /* Check noone else using our table */ @@ -394,7 +314,7 @@ ip6t_do_table(struct sk_buff **pskb, #endif /* For return from builtin chain */ - back = get_entry(table_base, table->private->underflow[hook]); + back = get_entry(table_base, private->underflow[hook]); do { IP_NF_ASSERT(e); @@ -494,145 +414,6 @@ ip6t_do_table(struct sk_buff **pskb, #endif } -/* - * These are weird, but module loading must not be done with mutex - * held (since they will register), and we have to have a single - * function to use try_then_request_module(). - */ - -/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. 
*/ -static inline struct ip6t_table *find_table_lock(const char *name) -{ - struct ip6t_table *t; - - if (down_interruptible(&ip6t_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &ip6t_tables, list) - if (strcmp(t->name, name) == 0 && try_module_get(t->me)) - return t; - up(&ip6t_mutex); - return NULL; -} - -/* Find match, grabs ref. Returns ERR_PTR() on error. */ -static inline struct ip6t_match *find_match(const char *name, u8 revision) -{ - struct ip6t_match *m; - int err = 0; - - if (down_interruptible(&ip6t_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(m, &ip6t_match, list) { - if (strcmp(m->name, name) == 0) { - if (m->revision == revision) { - if (try_module_get(m->me)) { - up(&ip6t_mutex); - return m; - } - } else - err = -EPROTOTYPE; /* Found something. */ - } - } - up(&ip6t_mutex); - return ERR_PTR(err); -} - -/* Find target, grabs ref. Returns ERR_PTR() on error. */ -static inline struct ip6t_target *find_target(const char *name, u8 revision) -{ - struct ip6t_target *t; - int err = 0; - - if (down_interruptible(&ip6t_mutex) != 0) - return ERR_PTR(-EINTR); - - list_for_each_entry(t, &ip6t_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision == revision) { - if (try_module_get(t->me)) { - up(&ip6t_mutex); - return t; - } - } else - err = -EPROTOTYPE; /* Found something. 
*/ - } - } - up(&ip6t_mutex); - return ERR_PTR(err); -} - -struct ip6t_target *ip6t_find_target(const char *name, u8 revision) -{ - struct ip6t_target *target; - - target = try_then_request_module(find_target(name, revision), - "ip6t_%s", name); - if (IS_ERR(target) || !target) - return NULL; - return target; -} - -static int match_revfn(const char *name, u8 revision, int *bestp) -{ - struct ip6t_match *m; - int have_rev = 0; - - list_for_each_entry(m, &ip6t_match, list) { - if (strcmp(m->name, name) == 0) { - if (m->revision > *bestp) - *bestp = m->revision; - if (m->revision == revision) - have_rev = 1; - } - } - return have_rev; -} - -static int target_revfn(const char *name, u8 revision, int *bestp) -{ - struct ip6t_target *t; - int have_rev = 0; - - list_for_each_entry(t, &ip6t_target, list) { - if (strcmp(t->name, name) == 0) { - if (t->revision > *bestp) - *bestp = t->revision; - if (t->revision == revision) - have_rev = 1; - } - } - return have_rev; -} - -/* Returns true or fals (if no such extension at all) */ -static inline int find_revision(const char *name, u8 revision, - int (*revfn)(const char *, u8, int *), - int *err) -{ - int have_rev, best = -1; - - if (down_interruptible(&ip6t_mutex) != 0) { - *err = -EINTR; - return 1; - } - have_rev = revfn(name, revision, &best); - up(&ip6t_mutex); - - /* Nothing at all? Return 0 to try loading module. */ - if (best == -1) { - *err = -ENOENT; - return 0; - } - - *err = best; - if (!have_rev) - *err = -EPROTONOSUPPORT; - return 1; -} - - /* All zeroes == unconditional rule. */ static inline int unconditional(const struct ip6t_ip6 *ipv6) @@ -649,7 +430,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. 
*/ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +440,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +490,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +513,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -794,11 +576,11 @@ check_match(struct ip6t_entry_match *m, { struct ip6t_match *match; - match = try_then_request_module(find_match(m->u.user.name, - m->u.user.revision), + match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, + m->u.user.revision), "ip6t_%s", m->u.user.name); if (IS_ERR(match) || !match) { - duprintf("check_match: `%s' not found\n", m->u.user.name); + duprintf("check_match: `%s' not found\n", m->u.user.name); return match ? 
PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; @@ -839,8 +621,9 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size, goto cleanup_matches; t = ip6t_get_target(e); - target = try_then_request_module(find_target(t->u.user.name, - t->u.user.revision), + target = try_then_request_module(xt_find_target(AF_INET6, + t->u.user.name, + t->u.user.revision), "ip6t_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("check_entry: `%s' not found\n", t->u.user.name); @@ -876,7 +659,7 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size, static inline int check_entry_size_and_hooks(struct ip6t_entry *e, - struct ip6t_table_info *newinfo, + struct xt_table_info *newinfo, unsigned char *base, unsigned char *limit, const unsigned int *hook_entries, @@ -910,7 +693,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, < 0 (not IP6T_RETURN). --RR */ /* Clear counters and comefrom */ - e->counters = ((struct ip6t_counters) { 0, 0 }); + e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; (*i)++; @@ -940,7 +723,8 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, - struct ip6t_table_info *newinfo, + struct xt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +745,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. 
*/ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,95 +777,79 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; } -static struct ip6t_table_info * -replace_table(struct ip6t_table *table, - unsigned int num_counters, - struct ip6t_table_info *newinfo, - int *error) +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ip6t_entry *e, + struct xt_counters total[], + unsigned int *i) { - struct ip6t_table_info *oldinfo; - -#ifdef CONFIG_NETFILTER_DEBUG - { - struct ip6t_entry *table_base; - unsigned int i; - - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; - } - } -#endif - - /* Do the substitution. */ - write_lock_bh(&table->lock); - /* Check inside lock: is the old number correct? 
*/ - if (num_counters != table->private->number) { - duprintf("num_counters != table->private->number (%u/%u)\n", - num_counters, table->private->number); - write_unlock_bh(&table->lock); - *error = -EAGAIN; - return NULL; - } - oldinfo = table->private; - table->private = newinfo; - newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - return oldinfo; + (*i)++; + return 0; } -/* Gets counters. */ static inline int -add_entry_to_counter(const struct ip6t_entry *e, +set_entry_to_counter(const struct ip6t_entry *e, struct ip6t_counters total[], unsigned int *i) { - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); (*i)++; return 0; } static void -get_counters(const struct ip6t_table_info *t, - struct ip6t_counters counters[]) +get_counters(const struct xt_table_info *t, + struct xt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. 
+ */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1091,31 +859,33 @@ get_counters(const struct ip6t_table_info *t, static int copy_entries_to_user(unsigned int total_size, - struct ip6t_table *table, + struct xt_table *table, void __user *userptr) { unsigned int off, num, countersize; struct ip6t_entry *e; - struct ip6t_counters *counters; + struct xt_counters *counters; + struct xt_table_info *private = table->private; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ - countersize = sizeof(struct ip6t_counters) * table->private->number; + countersize = sizeof(struct xt_counters) * private->number; counters = vmalloc(countersize); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); - get_counters(table->private, counters); + get_counters(private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... 
*/ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +897,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1173,23 +943,22 @@ get_entries(const struct ip6t_get_entries *entries, struct ip6t_get_entries __user *uptr) { int ret; - struct ip6t_table *t; + struct xt_table *t; - t = find_table_lock(entries->name); + t = xt_find_table_lock(AF_INET6, entries->name); if (t && !IS_ERR(t)) { - duprintf("t->private->number = %u\n", - t->private->number); - if (entries->size == t->private->size) - ret = copy_entries_to_user(t->private->size, + struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", private->number); + if (entries->size == private->size) + ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - t->private->size, - entries->size); + private->size, entries->size); ret = -EINVAL; } module_put(t->me); - up(&ip6t_mutex); + xt_table_unlock(t); } else ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -1201,45 +970,41 @@ do_replace(void __user *user, unsigned int len) { int ret; struct ip6t_replace tmp; - struct ip6t_table *t; - struct ip6t_table_info *newinfo, *oldinfo; - struct ip6t_counters *counters; + struct xt_table *t; + struct xt_table_info *newinfo, *oldinfo; + struct xt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) - return -ENOMEM; - - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } - counters = vmalloc(tmp.num_counters * sizeof(struct ip6t_counters)); + counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; duprintf("ip_tables: Translated table\n"); - t = try_then_request_module(find_table_lock(tmp.name), + t = try_then_request_module(xt_find_table_lock(AF_INET6, tmp.name), "ip6table_%s", tmp.name); if (!t || IS_ERR(t)) { ret = t ? 
PTR_ERR(t) : -ENOENT; @@ -1254,7 +1019,7 @@ do_replace(void __user *user, unsigned int len) goto put_module; } - oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -1271,24 +1036,25 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + xt_free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, - sizeof(struct ip6t_counters) * tmp.num_counters) != 0) + sizeof(struct xt_counters) * tmp.num_counters) != 0) ret = -EFAULT; vfree(counters); - up(&ip6t_mutex); + xt_table_unlock(t); return ret; put_module: module_put(t->me); - up(&ip6t_mutex); + xt_table_unlock(t); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } @@ -1296,7 +1062,7 @@ do_replace(void __user *user, unsigned int len) * and everything is OK. 
*/ static inline int add_counter_to_entry(struct ip6t_entry *e, - const struct ip6t_counters addme[], + const struct xt_counters addme[], unsigned int *i) { #if 0 @@ -1318,14 +1084,16 @@ static int do_add_counters(void __user *user, unsigned int len) { unsigned int i; - struct ip6t_counters_info tmp, *paddc; - struct ip6t_table *t; + struct xt_counters_info tmp, *paddc; + struct xt_table_info *private; + struct xt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ip6t_counters)) + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) return -EINVAL; paddc = vmalloc(len); @@ -1337,27 +1105,30 @@ do_add_counters(void __user *user, unsigned int len) goto free; } - t = find_table_lock(tmp.name); + t = xt_find_table_lock(AF_INET6, tmp.name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } write_lock_bh(&t->lock); - if (t->private->number != paddc->num_counters) { + private = t->private; + if (private->number != paddc->num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, - t->private->size, + /* Choose the copy that is on our node */ + loc_cpu_entry = private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, + private->size, add_counter_to_entry, paddc->counters, &i); unlock_up_free: write_unlock_bh(&t->lock); - up(&ip6t_mutex); + xt_table_unlock(t); module_put(t->me); free: vfree(paddc); @@ -1401,7 +1172,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) switch (cmd) { case IP6T_SO_GET_INFO: { char name[IP6T_TABLE_MAXNAMELEN]; - struct ip6t_table *t; + struct xt_table *t; if (*len != sizeof(struct ip6t_getinfo)) { duprintf("length %u != %u\n", *len, @@ -1416,25 +1187,26 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; - t = 
try_then_request_module(find_table_lock(name), + t = try_then_request_module(xt_find_table_lock(AF_INET6, name), "ip6table_%s", name); if (t && !IS_ERR(t)) { struct ip6t_getinfo info; + struct xt_table_info *private = t->private; info.valid_hooks = t->valid_hooks; - memcpy(info.hook_entry, t->private->hook_entry, + memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); - memcpy(info.underflow, t->private->underflow, + memcpy(info.underflow, private->underflow, sizeof(info.underflow)); - info.num_entries = t->private->number; - info.size = t->private->size; + info.num_entries = private->number; + info.size = private->size; memcpy(info.name, name, sizeof(info.name)); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; - up(&ip6t_mutex); + xt_table_unlock(t); module_put(t->me); } else ret = t ? PTR_ERR(t) : -ENOENT; @@ -1461,7 +1233,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct ip6t_get_revision rev; - int (*revfn)(const char *, u8, int *); + int target; if (*len != sizeof(rev)) { ret = -EINVAL; @@ -1473,12 +1245,13 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } if (cmd == IP6T_SO_GET_REVISION_TARGET) - revfn = target_revfn; + target = 1; else - revfn = match_revfn; + target = 0; - try_then_request_module(find_revision(rev.name, rev.revision, - revfn, &ret), + try_then_request_module(xt_find_revision(AF_INET6, rev.name, + rev.revision, + target, &ret), "ip6t_%s", rev.name); break; } @@ -1491,308 +1264,52 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -/* Registration hooks for targets. 
*/ -int -ip6t_register_target(struct ip6t_target *target) -{ - int ret; - - ret = down_interruptible(&ip6t_mutex); - if (ret != 0) - return ret; - list_add(&target->list, &ip6t_target); - up(&ip6t_mutex); - return ret; -} - -void -ip6t_unregister_target(struct ip6t_target *target) -{ - down(&ip6t_mutex); - LIST_DELETE(&ip6t_target, target); - up(&ip6t_mutex); -} - -int -ip6t_register_match(struct ip6t_match *match) -{ - int ret; - - ret = down_interruptible(&ip6t_mutex); - if (ret != 0) - return ret; - - list_add(&match->list, &ip6t_match); - up(&ip6t_mutex); - - return ret; -} - -void -ip6t_unregister_match(struct ip6t_match *match) -{ - down(&ip6t_mutex); - LIST_DELETE(&ip6t_match, match); - up(&ip6t_mutex); -} - -int ip6t_register_table(struct ip6t_table *table, +int ip6t_register_table(struct xt_table *table, const struct ip6t_replace *repl) { int ret; - struct ip6t_table_info *newinfo; - static struct ip6t_table_info bootstrap + struct xt_table_info *newinfo; + static struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + xt_free_table_info(newinfo); return ret; } - ret = down_interruptible(&ip6t_mutex); - if (ret != 0) { - vfree(newinfo); + if (xt_register_table(table, &bootstrap, newinfo) != 0) { + xt_free_table_info(newinfo); return ret; } - /* Don't autoload: we'd eat our tail... 
*/ - if (list_named_find(&ip6t_tables, table->name)) { - ret = -EEXIST; - goto free_unlock; - } - - /* Simplifies replace_table code. */ - table->private = &bootstrap; - if (!replace_table(table, 0, newinfo, &ret)) - goto free_unlock; - - duprintf("table->private->number = %u\n", - table->private->number); - - /* save number of initial entries */ - table->private->initial_entries = table->private->number; - - rwlock_init(&table->lock); - list_prepend(&ip6t_tables, table); - - unlock: - up(&ip6t_mutex); - return ret; - - free_unlock: - vfree(newinfo); - goto unlock; -} - -void ip6t_unregister_table(struct ip6t_table *table) -{ - down(&ip6t_mutex); - LIST_DELETE(&ip6t_tables, table); - up(&ip6t_mutex); - - /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, - cleanup_entry, NULL); - vfree(table->private); -} - -/* Returns 1 if the port is matched by the range, 0 otherwise */ -static inline int -port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) -{ - int ret; - - ret = (port >= min && port <= max) ^ invert; - return ret; -} - -static int -tcp_find_option(u_int8_t option, - const struct sk_buff *skb, - unsigned int tcpoff, - unsigned int optlen, - int invert, - int *hotdrop) -{ - /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ - u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; - unsigned int i; - - duprintf("tcp_match: finding option\n"); - if (!optlen) - return invert; - /* If we don't have the whole header, drop packet. 
*/ - op = skb_header_pointer(skb, tcpoff + sizeof(struct tcphdr), optlen, - _opt); - if (op == NULL) { - *hotdrop = 1; - return 0; - } - - for (i = 0; i < optlen; ) { - if (op[i] == option) return !invert; - if (op[i] < 2) i++; - else i += op[i+1]?:1; - } - - return invert; -} - -static int -tcp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - struct tcphdr _tcph, *th; - const struct ip6t_tcp *tcpinfo = matchinfo; - - if (offset) { - /* To quote Alan: - - Don't allow a fragment of TCP 8 bytes in. Nobody normal - causes this. Its a cracker trying to break in by doing a - flag overwrite to pass the direction checks. - */ - if (offset == 1) { - duprintf("Dropping evil TCP offset=1 frag.\n"); - *hotdrop = 1; - } - /* Must not be a fragment. */ - return 0; - } - -#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) - - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); - if (th == NULL) { - /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ - duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = 1; - return 0; - } - - if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], - ntohs(th->source), - !!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT))) - return 0; - if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], - ntohs(th->dest), - !!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT))) - return 0; - if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) - == tcpinfo->flg_cmp, - IP6T_TCP_INV_FLAGS)) - return 0; - if (tcpinfo->option) { - if (th->doff * 4 < sizeof(_tcph)) { - *hotdrop = 1; - return 0; - } - if (!tcp_find_option(tcpinfo->option, skb, protoff, - th->doff*4 - sizeof(*th), - tcpinfo->invflags & IP6T_TCP_INV_OPTION, - hotdrop)) - return 0; - } - return 1; -} - -/* Called when user tries to insert an entry of this type. 
*/ -static int -tcp_checkentry(const char *tablename, - const struct ip6t_ip6 *ipv6, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - const struct ip6t_tcp *tcpinfo = matchinfo; - - /* Must specify proto == TCP, and no unknown invflags */ - return ipv6->proto == IPPROTO_TCP - && !(ipv6->invflags & IP6T_INV_PROTO) - && matchsize == IP6T_ALIGN(sizeof(struct ip6t_tcp)) - && !(tcpinfo->invflags & ~IP6T_TCP_INV_MASK); + return 0; } -static int -udp_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +void ip6t_unregister_table(struct xt_table *table) { - struct udphdr _udph, *uh; - const struct ip6t_udp *udpinfo = matchinfo; + struct xt_table_info *private; + void *loc_cpu_entry; - /* Must not be a fragment. */ - if (offset) - return 0; - - uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); - if (uh == NULL) { - /* We've been asked to examine this packet, and we - can't. Hence, no choice but to drop. */ - duprintf("Dropping evil UDP tinygram.\n"); - *hotdrop = 1; - return 0; - } - - return port_match(udpinfo->spts[0], udpinfo->spts[1], - ntohs(uh->source), - !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT)) - && port_match(udpinfo->dpts[0], udpinfo->dpts[1], - ntohs(uh->dest), - !!(udpinfo->invflags & IP6T_UDP_INV_DSTPT)); -} - -/* Called when user tries to insert an entry of this type. 
*/ -static int -udp_checkentry(const char *tablename, - const struct ip6t_ip6 *ipv6, - void *matchinfo, - unsigned int matchinfosize, - unsigned int hook_mask) -{ - const struct ip6t_udp *udpinfo = matchinfo; + private = xt_unregister_table(table); - /* Must specify proto == UDP, and no unknown invflags */ - if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & IP6T_INV_PROTO)) { - duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto, - IPPROTO_UDP); - return 0; - } - if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_udp))) { - duprintf("ip6t_udp: matchsize %u != %u\n", - matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_udp))); - return 0; - } - if (udpinfo->invflags & ~IP6T_UDP_INV_MASK) { - duprintf("ip6t_udp: unknown flags %X\n", - udpinfo->invflags); - return 0; - } - - return 1; + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_free_table_info(private); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ @@ -1840,11 +1357,12 @@ icmp6_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. 
*/ static int icmp6_checkentry(const char *tablename, - const struct ip6t_ip6 *ipv6, + const void *entry, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { + const struct ip6t_ip6 *ipv6 = entry; const struct ip6t_icmp *icmpinfo = matchinfo; /* Must specify proto == ICMP, and no unknown invflags */ @@ -1874,187 +1392,78 @@ static struct nf_sockopt_ops ip6t_sockopts = { .get = do_ip6t_get_ctl, }; -static struct ip6t_match tcp_matchstruct = { - .name = "tcp", - .match = &tcp_match, - .checkentry = &tcp_checkentry, -}; - -static struct ip6t_match udp_matchstruct = { - .name = "udp", - .match = &udp_match, - .checkentry = &udp_checkentry, -}; - static struct ip6t_match icmp6_matchstruct = { .name = "icmp6", .match = &icmp6_match, .checkentry = &icmp6_checkentry, }; -#ifdef CONFIG_PROC_FS -static inline int print_name(const char *i, - off_t start_offset, char *buffer, int length, - off_t *pos, unsigned int *count) -{ - if ((*count)++ >= start_offset) { - unsigned int namelen; - - namelen = sprintf(buffer + *pos, "%s\n", - i + sizeof(struct list_head)); - if (*pos + namelen > length) { - /* Stop iterating */ - return 1; - } - *pos += namelen; - } - return 0; -} - -static inline int print_target(const struct ip6t_target *t, - off_t start_offset, char *buffer, int length, - off_t *pos, unsigned int *count) -{ - if (t == &ip6t_standard_target || t == &ip6t_error_target) - return 0; - return print_name((char *)t, start_offset, buffer, length, pos, count); -} - -static int ip6t_get_tables(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&ip6t_mutex) != 0) - return 0; - - LIST_FIND(&ip6t_tables, print_name, char *, - offset, buffer, length, &pos, &count); - - up(&ip6t_mutex); - - /* `start' hack - see fs/proc/generic.c line ~105 */ - *start=(char *)((unsigned long)count-offset); - return pos; -} - -static int ip6t_get_targets(char *buffer, char **start, off_t offset, int length) -{ - 
off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&ip6t_mutex) != 0) - return 0; - - LIST_FIND(&ip6t_target, print_target, struct ip6t_target *, - offset, buffer, length, &pos, &count); - - up(&ip6t_mutex); - - *start = (char *)((unsigned long)count - offset); - return pos; -} - -static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length) -{ - off_t pos = 0; - unsigned int count = 0; - - if (down_interruptible(&ip6t_mutex) != 0) - return 0; - - LIST_FIND(&ip6t_match, print_name, char *, - offset, buffer, length, &pos, &count); - - up(&ip6t_mutex); - - *start = (char *)((unsigned long)count - offset); - return pos; -} - -static const struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = -{ { "ip6_tables_names", ip6t_get_tables }, - { "ip6_tables_targets", ip6t_get_targets }, - { "ip6_tables_matches", ip6t_get_matches }, - { NULL, NULL} }; -#endif /*CONFIG_PROC_FS*/ - static int __init init(void) { int ret; + xt_proto_init(AF_INET6); + /* Noone else will be downing sem now, so we won't sleep */ - down(&ip6t_mutex); - list_append(&ip6t_target, &ip6t_standard_target); - list_append(&ip6t_target, &ip6t_error_target); - list_append(&ip6t_match, &tcp_matchstruct); - list_append(&ip6t_match, &udp_matchstruct); - list_append(&ip6t_match, &icmp6_matchstruct); - up(&ip6t_mutex); + xt_register_target(AF_INET6, &ip6t_standard_target); + xt_register_target(AF_INET6, &ip6t_error_target); + xt_register_match(AF_INET6, &icmp6_matchstruct); /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) { duprintf("Unable to register sockopts.\n"); + xt_proto_fini(AF_INET6); return ret; } -#ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *proc; - int i; - - for (i = 0; ip6t_proc_entry[i].name; i++) { - proc = proc_net_create(ip6t_proc_entry[i].name, 0, - ip6t_proc_entry[i].get_info); - if (!proc) { - while (--i >= 0) - proc_net_remove(ip6t_proc_entry[i].name); - nf_unregister_sockopt(&ip6t_sockopts); - return 
-ENOMEM; - } - proc->owner = THIS_MODULE; - } - } -#endif - - printk("ip6_tables: (C) 2000-2002 Netfilter core team\n"); + printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); return 0; } static void __exit fini(void) { nf_unregister_sockopt(&ip6t_sockopts); -#ifdef CONFIG_PROC_FS - { - int i; - for (i = 0; ip6t_proc_entry[i].name; i++) - proc_net_remove(ip6t_proc_entry[i].name); - } -#endif + xt_unregister_match(AF_INET6, &icmp6_matchstruct); + xt_unregister_target(AF_INET6, &ip6t_error_target); + xt_unregister_target(AF_INET6, &ip6t_standard_target); + xt_proto_fini(AF_INET6); } /* - * find specified header up to transport protocol header. - * If found target header, the offset to the header is set to *offset - * and return 0. otherwise, return -1. + * find the offset to specified header or the protocol number of last header + * if target < 0. "last header" is transport protocol header, ESP, or + * "No next header". + * + * If target header is found, its offset is set in *offset and return protocol + * number. Otherwise, return -1. + * + * Note that non-1st fragment is special case that "the protocol number + * of last header" is "next header" field in Fragment header. In this case, + * *offset is meaningless and fragment offset is stored in *fragoff if fragoff + * isn't NULL. * - * Notes: - non-1st Fragment Header isn't skipped. - * - ESP header isn't skipped. - * - The target header may be trancated. 
*/ -int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target) +int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff) { unsigned int start = (u8*)(skb->nh.ipv6h + 1) - skb->data; u8 nexthdr = skb->nh.ipv6h->nexthdr; unsigned int len = skb->len - start; + if (fragoff) + *fragoff = 0; + while (nexthdr != target) { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; - if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0) + break; return -1; + } + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (hp == NULL) return -1; @@ -2068,8 +1477,17 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target) if (fp == NULL) return -1; - if (ntohs(*fp) & ~0x7) + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + nexthdr == NEXTHDR_NONE)) { + if (fragoff) + *fragoff = _frag_off; + return hp->nexthdr; + } return -1; + } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = (hp->hdrlen + 2) << 2; @@ -2082,18 +1500,15 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, u8 target) } *offset = start; - return 0; + return nexthdr; } EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_do_table); -EXPORT_SYMBOL(ip6t_register_match); -EXPORT_SYMBOL(ip6t_unregister_match); -EXPORT_SYMBOL(ip6t_register_target); -EXPORT_SYMBOL(ip6t_unregister_target); EXPORT_SYMBOL(ip6t_ext_hdr); EXPORT_SYMBOL(ipv6_find_hdr); +EXPORT_SYMBOL(ip6_masked_addrcmp); module_init(init); module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c index 8f5549b72720..306200c35057 100644 --- a/net/ipv6/netfilter/ip6t_HL.c +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -62,7 +62,7 @@ static unsigned int ip6t_hl_target(struct sk_buff **pskb, } static int ip6t_hl_checkentry(const char *tablename, - 
const struct ip6t_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1bd9033..77c725832dec 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/ip.h> #include <linux/spinlock.h> #include <linux/icmpv6.h> @@ -62,9 +63,8 @@ static void dump_packet(const struct nf_loginfo *info, return; } - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000" */ - printk("SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->saddr)); - printk("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->daddr)); + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + printk("SRC=" NIP6_FMT " DST=" NIP6_FMT " ", NIP6(ih->saddr), NIP6(ih->daddr)); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", @@ -443,7 +443,7 @@ ip6t_log_target(struct sk_buff **pskb, static int ip6t_log_checkentry(const char *tablename, - const struct ip6t_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c deleted file mode 100644 index eab8fb864ee0..000000000000 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ /dev/null @@ -1,81 +0,0 @@ -/* This is a module which is used for setting the NFMARK field of an skb. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter_ipv6/ip6t_MARK.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); - -static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const void *targinfo, - void *userinfo) -{ - const struct ip6t_mark_target_info *markinfo = targinfo; - - if((*pskb)->nfmark != markinfo->mark) - (*pskb)->nfmark = markinfo->mark; - - return IP6T_CONTINUE; -} - -static int -checkentry(const char *tablename, - const struct ip6t_entry *e, - void *targinfo, - unsigned int targinfosize, - unsigned int hook_mask) -{ - if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))) { - printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", - targinfosize, - IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))); - return 0; - } - - if (strcmp(tablename, "mangle") != 0) { - printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); - return 0; - } - - return 1; -} - -static struct ip6t_target ip6t_mark_reg = { - .name = "MARK", - .target = target, - .checkentry = checkentry, - .me = THIS_MODULE -}; - -static int __init init(void) -{ - printk(KERN_DEBUG "registering ipv6 mark target\n"); - if (ip6t_register_target(&ip6t_mark_reg)) - return -EINVAL; - - return 0; -} - -static void __exit fini(void) -{ - ip6t_unregister_target(&ip6t_mark_reg); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c deleted file mode 100644 index c6e3730e7409..000000000000 --- a/net/ipv6/netfilter/ip6t_NFQUEUE.c +++ /dev/null @@ -1,70 +0,0 @@ -/* ip6tables module for using new netfilter netlink queue - * - * (C) 2005 by Harald Welte <laforge@netfilter.org> - * - * This program is free software; you can 
redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter_ipv4/ipt_NFQUEUE.h> - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("ip6tables NFQUEUE target"); -MODULE_LICENSE("GPL"); - -static unsigned int -target(struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const void *targinfo, - void *userinfo) -{ - const struct ipt_NFQ_info *tinfo = targinfo; - - return NF_QUEUE_NR(tinfo->queuenum); -} - -static int -checkentry(const char *tablename, - const struct ip6t_entry *e, - void *targinfo, - unsigned int targinfosize, - unsigned int hook_mask) -{ - if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) { - printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", - targinfosize, - IP6T_ALIGN(sizeof(struct ipt_NFQ_info))); - return 0; - } - - return 1; -} - -static struct ip6t_target ipt_NFQ_reg = { - .name = "NFQUEUE", - .target = target, - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ip6t_register_target(&ipt_NFQ_reg); -} - -static void __exit fini(void) -{ - ip6t_unregister_target(&ipt_NFQ_reg); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index b03e87adca93..c745717b4ce2 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -218,12 +218,13 @@ static unsigned int reject6_target(struct sk_buff **pskb, } static int check(const char *tablename, - const struct ip6t_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { const struct ip6t_reject_info *rejinfo = targinfo; + const struct ip6t_entry *e = entry; if 
(targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) { DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index dde37793d20b..219a30365dff 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> @@ -53,7 +54,7 @@ match(const struct sk_buff *skb, unsigned int ptr; unsigned int hdrlen = 0; - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL) < 0) return 0; ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); @@ -97,7 +98,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *entry, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c index c450a635e54b..80fe82669ce2 100644 --- a/net/ipv6/netfilter/ip6t_dst.c +++ b/net/ipv6/netfilter/ip6t_dst.c @@ -71,9 +71,9 @@ match(const struct sk_buff *skb, unsigned int optlen; #if HOPBYHOP - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP, NULL) < 0) #else - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST, NULL) < 0) #endif return 0; @@ -178,7 +178,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. 
*/ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *info, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cde43a1..724285df8711 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/ip.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> @@ -55,7 +56,7 @@ match(const struct sk_buff *skb, /* Make sure this isn't an evil packet */ /*DEBUGP("ipv6_esp entered \n");*/ - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ESP) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ESP, NULL) < 0) return 0; eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp); @@ -75,7 +76,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index 616c2cbcd54d..ddf5f571909c 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -62,7 +62,7 @@ match(const struct sk_buff *skb, static int ip6t_eui64_checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index 085d5f8eea29..a9964b946ed5 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -52,7 +52,7 @@ match(const struct sk_buff *skb, const struct ip6t_frag *fraginfo = matchinfo; unsigned int ptr; - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL) < 0) return 0; fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); @@ 
-115,7 +115,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index 1d09485111d0..ed8ded18bbd4 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -71,9 +71,9 @@ match(const struct sk_buff *skb, unsigned int optlen; #if HOPBYHOP - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_HOP, NULL) < 0) #else - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_DEST, NULL) < 0) #endif return 0; @@ -178,7 +178,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *entry, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c index 0beaff5471dd..c5d9079f2d9d 100644 --- a/net/ipv6/netfilter/ip6t_hl.c +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -48,7 +48,7 @@ static int match(const struct sk_buff *skb, const struct net_device *in, return 0; } -static int checkentry(const char *tablename, const struct ip6t_ip6 *ip, +static int checkentry(const char *tablename, const void *entry, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 32e67f05845b..fda1ceaf5a29 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -124,7 +124,7 @@ ipv6header_match(const struct sk_buff *skb, static int ipv6header_checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff 
--git a/net/ipv6/netfilter/ip6t_length.c b/net/ipv6/netfilter/ip6t_length.c deleted file mode 100644 index e0537d3811d5..000000000000 --- a/net/ipv6/netfilter/ip6t_length.c +++ /dev/null @@ -1,66 +0,0 @@ -/* Length Match - IPv6 Port */ - -/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netfilter_ipv6/ip6t_length.h> -#include <linux/netfilter_ipv6/ip6_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("IPv6 packet length match"); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct ip6t_length_info *info = matchinfo; - u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); - - return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; -} - -static int -checkentry(const char *tablename, - const struct ip6t_ip6 *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_length_info))) - return 0; - - return 1; -} - -static struct ip6t_match length_match = { - .name = "length", - .match = &match, - .checkentry = &checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ip6t_register_match(&length_match); -} - -static void __exit fini(void) -{ - ip6t_unregister_match(&length_match); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_limit.c b/net/ipv6/netfilter/ip6t_limit.c deleted file mode 100644 index fb782f610be2..000000000000 --- a/net/ipv6/netfilter/ip6t_limit.c +++ /dev/null @@ -1,147 +0,0 @@ -/* Kernel module to 
control the rate - * - * 2 September 1999: Changed from the target RATE to the match - * `limit', removed logging. Did I mention that - * Alexey is a fucking genius? - * Rusty Russell (rusty@rustcorp.com.au). */ - -/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> - * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> - -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <linux/netfilter_ipv6/ip6t_limit.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); -MODULE_DESCRIPTION("rate limiting within ip6tables"); - -/* The algorithm used is the Simple Token Bucket Filter (TBF) - * see net/sched/sch_tbf.c in the linux source tree - */ - -static DEFINE_SPINLOCK(limit_lock); - -/* Rusty: This is my (non-mathematically-inclined) understanding of - this algorithm. The `average rate' in jiffies becomes your initial - amount of credit `credit' and the most credit you can ever have - `credit_cap'. The `peak rate' becomes the cost of passing the - test, `cost'. - - `prev' tracks the last packet hit: you gain one credit per jiffy. - If you get credit balance more than this, the extra credit is - discarded. Every time the match passes, you lose `cost' credits; - if you don't have that many, the test fails. - - See Alexey's formal explanation in net/sched/sch_tbf.c. - - To avoid underflow, we multiply by 128 (ie. you get 128 credits per - jiffy). Hence a cost of 2^32-1, means one pass per 32768 seconds - at 1024HZ (or one every 9 hours). A cost of 1 means 12800 passes - per second at 100HZ. 
*/ - -#define CREDITS_PER_JIFFY 128 - -static int -ip6t_limit_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - struct ip6t_rateinfo *r = ((struct ip6t_rateinfo *)matchinfo)->master; - unsigned long now = jiffies; - - spin_lock_bh(&limit_lock); - r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; - if (r->credit > r->credit_cap) - r->credit = r->credit_cap; - - if (r->credit >= r->cost) { - /* We're not limited. */ - r->credit -= r->cost; - spin_unlock_bh(&limit_lock); - return 1; - } - - spin_unlock_bh(&limit_lock); - return 0; -} - -/* Precision saver. */ -static u_int32_t -user2credits(u_int32_t user) -{ - /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) - /* Divide first. */ - return (user / IP6T_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; - - return (user * HZ * CREDITS_PER_JIFFY) / IP6T_LIMIT_SCALE; -} - -static int -ip6t_limit_checkentry(const char *tablename, - const struct ip6t_ip6 *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - struct ip6t_rateinfo *r = matchinfo; - - if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_rateinfo))) - return 0; - - /* Check for overflow. */ - if (r->burst == 0 - || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Call rusty: overflow in ip6t_limit: %u/%u\n", - r->avg, r->burst); - return 0; - } - - /* User avg in seconds * IP6T_LIMIT_SCALE: convert to jiffies * - 128. */ - r->prev = jiffies; - r->credit = user2credits(r->avg * r->burst); /* Credits full. */ - r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ - r->cost = user2credits(r->avg); - - /* For SMP, we only want to use one set of counters. 
*/ - r->master = r; - - return 1; -} - -static struct ip6t_match ip6t_limit_reg = { - .name = "limit", - .match = ip6t_limit_match, - .checkentry = ip6t_limit_checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - if (ip6t_register_match(&ip6t_limit_reg)) - return -EINVAL; - return 0; -} - -static void __exit fini(void) -{ - ip6t_unregister_match(&ip6t_limit_reg); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mac.c b/net/ipv6/netfilter/ip6t_mac.c deleted file mode 100644 index 526d43e37234..000000000000 --- a/net/ipv6/netfilter/ip6t_mac.c +++ /dev/null @@ -1,80 +0,0 @@ -/* Kernel module to match MAC address parameters. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/if_ether.h> - -#include <linux/netfilter_ipv6/ip6t_mac.h> -#include <linux/netfilter_ipv6/ip6_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MAC address matching module for IPv6"); -MODULE_AUTHOR("Netfilter Core Teaam <coreteam@netfilter.org>"); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct ip6t_mac_info *info = matchinfo; - - /* Is mac pointer valid? */ - return (skb->mac.raw >= skb->head - && (skb->mac.raw + ETH_HLEN) <= skb->data - /* If so, compare... 
*/ - && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) - == 0) ^ info->invert)); -} - -static int -ip6t_mac_checkentry(const char *tablename, - const struct ip6t_ip6 *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - if (hook_mask - & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) - | (1 << NF_IP6_FORWARD))) { - printk("ip6t_mac: only valid for PRE_ROUTING, LOCAL_IN or" - " FORWARD\n"); - return 0; - } - - if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mac_info))) - return 0; - - return 1; -} - -static struct ip6t_match mac_match = { - .name = "mac", - .match = &match, - .checkentry = &ip6t_mac_checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ip6t_register_match(&mac_match); -} - -static void __exit fini(void) -{ - ip6t_unregister_match(&mac_match); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mark.c b/net/ipv6/netfilter/ip6t_mark.c deleted file mode 100644 index affc3de364fc..000000000000 --- a/net/ipv6/netfilter/ip6t_mark.c +++ /dev/null @@ -1,66 +0,0 @@ -/* Kernel module to match NFMARK values. */ - -/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - - -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv6/ip6t_mark.h> -#include <linux/netfilter_ipv6/ip6_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("ip6tables mark match"); - -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) -{ - const struct ip6t_mark_info *info = matchinfo; - - return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; -} - -static int -checkentry(const char *tablename, - const struct ip6t_ip6 *ip, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) -{ - if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mark_info))) - return 0; - - return 1; -} - -static struct ip6t_match mark_match = { - .name = "mark", - .match = &match, - .checkentry = &checkentry, - .me = THIS_MODULE, -}; - -static int __init init(void) -{ - return ip6t_register_match(&mark_match); -} - -static void __exit fini(void) -{ - ip6t_unregister_match(&mark_match); -} - -module_init(init); -module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_multiport.c b/net/ipv6/netfilter/ip6t_multiport.c index 6e3246153fa3..49f7829dfbc2 100644 --- a/net/ipv6/netfilter/ip6t_multiport.c +++ b/net/ipv6/netfilter/ip6t_multiport.c @@ -84,11 +84,12 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. 
*/ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *info, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { + const struct ip6t_ip6 *ip = info; const struct ip6t_multiport *multiinfo = matchinfo; if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_multiport))) diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c index 4de4cdad4b7d..5409b375b512 100644 --- a/net/ipv6/netfilter/ip6t_owner.c +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -53,7 +53,7 @@ match(const struct sk_buff *skb, static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c new file mode 100644 index 000000000000..13fedad48c1d --- /dev/null +++ b/net/ipv6/netfilter/ip6t_policy.c @@ -0,0 +1,175 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/xfrm.h> + +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_policy.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("IPtables IPsec policy matching module"); +MODULE_LICENSE("GPL"); + + +static inline int +match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e) +{ +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + ((ip6_masked_addrcmp((z), &e->x, &e->y)) == 0) ^ e->invert.x) +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) && + MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info) +{ + const struct ip6t_policy_elem *e; + struct sec_path *sp = skb->sp; + int strict = info->flags & IP6T_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->x[i].xvec, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info) +{ + const struct ip6t_policy_elem *e; + struct dst_entry *dst = skb->dst; + int strict = info->flags & IP6T_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? 
i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_policy_info *info = matchinfo; + int ret; + + if (info->flags & IP6T_POLICY_MATCH_IN) + ret = match_policy_in(skb, info); + else + ret = match_policy_out(skb, info); + + if (ret < 0) + ret = info->flags & IP6T_POLICY_MATCH_NONE ? 1 : 0; + else if (info->flags & IP6T_POLICY_MATCH_NONE) + ret = 0; + + return ret; +} + +static int checkentry(const char *tablename, const struct ip6t_ip6 *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + struct ip6t_policy_info *info = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(*info))) { + printk(KERN_ERR "ip6t_policy: matchsize %u != %zu\n", + matchsize, IP6T_ALIGN(sizeof(*info))); + return 0; + } + if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) { + printk(KERN_ERR "ip6t_policy: neither incoming nor " + "outgoing policy selected\n"); + return 0; + } + if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN) + && info->flags & IP6T_POLICY_MATCH_OUT) { + printk(KERN_ERR "ip6t_policy: output policy not valid in " + "PRE_ROUTING and INPUT\n"); + return 0; + } + if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT) + && info->flags & IP6T_POLICY_MATCH_IN) { + printk(KERN_ERR "ip6t_policy: input policy not valid in " + "POST_ROUTING and OUTPUT\n"); + return 0; + } + if (info->len > IP6T_POLICY_MAX_ELEM) { + printk(KERN_ERR "ip6t_policy: too many policy elements\n"); + return 0; + } + + return 1; +} + +static struct ip6t_match policy_match = { + .name = "policy", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init 
init(void) +{ + return ip6t_register_match(&policy_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&policy_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index beb2fd5cebbb..8465b4375855 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -58,7 +58,7 @@ match(const struct sk_buff *skb, unsigned int ret = 0; struct in6_addr *ap, _addr; - if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING) < 0) + if (ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL) < 0) return 0; rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); @@ -183,7 +183,7 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *entry, void *matchinfo, unsigned int matchinfosize, unsigned int hook_mask) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 4c0028671c20..ce4a968e1f70 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -97,6 +97,7 @@ static struct ip6t_table packet_filter = { .valid_hooks = FILTER_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, + .af = AF_INET6, }; /* The work comes in here from netfilter.c. */ diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 85c1e6eada19..30a4627e000d 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -127,6 +127,7 @@ static struct ip6t_table packet_mangler = { .valid_hooks = MANGLE_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, + .af = AF_INET6, }; /* The work comes in here from netfilter.c. 
*/ diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index c2982efd14af..db28ba3855e2 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -106,11 +106,12 @@ static struct } }; -static struct ip6t_table packet_raw = { +static struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .lock = RW_LOCK_UNLOCKED, - .me = THIS_MODULE + .me = THIS_MODULE, + .af = AF_INET6, }; /* The work comes in here from netfilter.c. */ diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 753a3ae8502b..ac702a29dd16 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -74,7 +74,7 @@ static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, static int ipv6_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ", + return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ", NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); } @@ -335,10 +335,10 @@ static struct nf_hook_ops ipv6_conntrack_local_in_ops = { #ifdef CONFIG_SYSCTL /* From nf_conntrack_proto_icmpv6.c */ -extern unsigned long nf_ct_icmpv6_timeout; +extern unsigned int nf_ct_icmpv6_timeout; /* From nf_conntrack_frag6.c */ -extern unsigned long nf_ct_frag6_timeout; +extern unsigned int nf_ct_frag6_timeout; extern unsigned int nf_ct_frag6_low_thresh; extern unsigned int nf_ct_frag6_high_thresh; @@ -401,6 +401,48 @@ static ctl_table nf_ct_net_table[] = { }; #endif +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int ipv6_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) 
* 4, + &tuple->src.u3.ip6); + NFA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V6_SRC-1] = sizeof(u_int32_t)*4, + [CTA_IP_V6_DST-1] = sizeof(u_int32_t)*4, +}; + +static int ipv6_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC-1] || !tb[CTA_IP_V6_DST-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + memcpy(&t->src.u3.ip6, NFA_DATA(tb[CTA_IP_V6_SRC-1]), + sizeof(u_int32_t) * 4); + memcpy(&t->dst.u3.ip6, NFA_DATA(tb[CTA_IP_V6_DST-1]), + sizeof(u_int32_t) * 4); + + return 0; +} +#endif + struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .name = "ipv6", @@ -409,6 +451,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .print_tuple = ipv6_print_tuple, .print_conntrack = ipv6_print_conntrack, .prepare = ipv6_prepare, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = ipv6_tuple_to_nfattr, + .nfattr_to_tuple = ipv6_nfattr_to_tuple, +#endif .get_features = ipv6_get_features, .me = THIS_MODULE, }; @@ -537,7 +584,7 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); static int __init init(void) { - need_nf_conntrack(); + need_conntrack(); return init_or_cleanup(1); } @@ -548,9 +595,3 @@ static void __exit fini(void) module_init(init); module_exit(fini); - -void need_ip6_conntrack(void) -{ -} - -EXPORT_SYMBOL(need_ip6_conntrack); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a7e03cfacd06..09945c333055 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -57,17 +57,17 @@ static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, return 1; } +/* Add 1; spaces filled with 0. 
*/ +static u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 +}; + static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] = { - [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, - [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 - }; - int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; @@ -185,7 +185,7 @@ icmpv6_error_message(struct sk_buff *skb, return -NF_ACCEPT; } - inproto = nf_ct_find_proto(PF_INET6, inprotonum); + inproto = __nf_ct_proto_find(PF_INET6, inprotonum); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, @@ -255,6 +255,60 @@ skipped: return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +static int icmpv6_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMPV6_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMPV6_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMPV6_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_ICMPV6_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMPV6_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMPV6_ID-1] = sizeof(u_int16_t) +}; + +static int icmpv6_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if 
(!tb[CTA_PROTO_ICMPV6_TYPE-1] + || !tb[CTA_PROTO_ICMPV6_CODE-1] + || !tb[CTA_PROTO_ICMPV6_ID-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_CODE-1]); + tuple->src.u.icmp.id = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_ID-1]); + + if (tuple->dst.u.icmp.type < 128 + || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) + || !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} +#endif + struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = { .l3proto = PF_INET6, @@ -267,6 +321,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = .packet = icmpv6_packet, .new = icmpv6_new, .error = icmpv6_error, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = icmpv6_tuple_to_nfattr, + .nfattr_to_tuple = icmpv6_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c2c52af9e560..84ef9a13108d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -70,8 +70,8 @@ struct nf_ct_frag6_skb_cb struct nf_ct_frag6_queue { - struct nf_ct_frag6_queue *next; - struct list_head lru_list; /* lru list member */ + struct hlist_node list; + struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ struct in6_addr saddr; @@ -90,24 +90,21 @@ struct nf_ct_frag6_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; - struct nf_ct_frag6_queue **pprev; }; /* Hash table. 
*/ #define FRAG6Q_HASHSZ 64 -static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; -static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static struct hlist_head nf_ct_frag6_hash[FRAG6Q_HASHSZ]; +static DEFINE_RWLOCK(nf_ct_frag6_lock); static u32 nf_ct_frag6_hash_rnd; static LIST_HEAD(nf_ct_frag6_lru_list); int nf_ct_frag6_nqueues = 0; static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq) { - if (fq->next) - fq->next->pprev = fq->pprev; - *fq->pprev = fq->next; + hlist_del(&fq->list); list_del(&fq->lru_list); nf_ct_frag6_nqueues--; } @@ -158,28 +155,18 @@ static void nf_ct_frag6_secret_rebuild(unsigned long dummy) get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32)); for (i = 0; i < FRAG6Q_HASHSZ; i++) { struct nf_ct_frag6_queue *q; + struct hlist_node *p, *n; - q = nf_ct_frag6_hash[i]; - while (q) { - struct nf_ct_frag6_queue *next = q->next; + hlist_for_each_entry_safe(q, p, n, &nf_ct_frag6_hash[i], list) { unsigned int hval = ip6qhashfn(q->id, &q->saddr, &q->daddr); - if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; - + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = nf_ct_frag6_hash[hval]) != NULL) - q->next->pprev = &q->next; - nf_ct_frag6_hash[hval] = q; - q->pprev = &nf_ct_frag6_hash[hval]; + hlist_add_head(&q->list, + &nf_ct_frag6_hash[hval]); } - - q = next; } } write_unlock(&nf_ct_frag6_lock); @@ -314,15 +301,17 @@ out: /* Creation primitives. 
*/ - static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, struct nf_ct_frag6_queue *fq_in) { struct nf_ct_frag6_queue *fq; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&nf_ct_frag6_lock); #ifdef CONFIG_SMP - for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) { if (fq->id == fq_in->id && !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { @@ -340,10 +329,7 @@ static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, atomic_inc(&fq->refcnt); atomic_inc(&fq->refcnt); - if ((fq->next = nf_ct_frag6_hash[hash]) != NULL) - fq->next->pprev = &fq->next; - nf_ct_frag6_hash[hash] = fq; - fq->pprev = &nf_ct_frag6_hash[hash]; + hlist_add_head(&fq->list, &nf_ct_frag6_hash[hash]); INIT_LIST_HEAD(&fq->lru_list); list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list); nf_ct_frag6_nqueues++; @@ -371,7 +357,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct init_timer(&fq->timer); fq->timer.function = nf_ct_frag6_expire; fq->timer.data = (long) fq; - fq->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); return nf_ct_frag6_intern(hash, fq); @@ -384,10 +370,11 @@ static __inline__ struct nf_ct_frag6_queue * fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) { struct nf_ct_frag6_queue *fq; + struct hlist_node *n; unsigned int hash = ip6qhashfn(id, src, dst); read_lock(&nf_ct_frag6_lock); - for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &nf_ct_frag6_hash[hash], list) { if (fq->id == id && !ipv6_addr_cmp(src, &fq->saddr) && !ipv6_addr_cmp(dst, &fq->daddr)) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a66900cda2af..66f1d12ea578 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -32,6 +32,7 @@ #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> +#include <linux/skbuff.h> #include 
<asm/uaccess.h> #include <asm/ioctls.h> #include <asm/bug.h> @@ -433,25 +434,14 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } + skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; /* FIXME: increment a raw6 drops counter here */ - goto out_free; + goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5d316cb72ec9..15e1456b3f18 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -581,7 +581,6 @@ err: * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, - unsigned int *nhoffp, struct net_device *dev) { struct sk_buff *fp, *head = fq->fragments; @@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, head->dev = dev; skb_set_timestamp(head, &fq->stamp); head->nh.ipv6h->payload_len = htons(payload_len); + IP6CB(head)->nhoff = nhoff; *skb_in = head; @@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); fq->fragments = NULL; - *nhoffp = nhoff; return 1; out_oversize: @@ -678,7 +677,7 @@ out_fail: return -1; } -static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +static int ipv6_frag_rcv(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; struct net_device *dev = skb->dev; @@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) skb->h.raw += sizeof(struct frag_hdr); IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); - 
*nhoffp = (u8*)fhdr - skb->nh.raw; + IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw; return 1; } @@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) spin_lock(&fq->lock); - ip6_frag_queue(fq, skb, fhdr, *nhoffp); + ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) - ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); + ret = ip6_frag_reasm(fq, skbp, dev); spin_unlock(&fq->lock); fq_put(fq, NULL); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 66140f13d119..e0d3ad02ffb5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -24,6 +24,7 @@ * reachable. otherwise, round-robin the list. */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index c3123c9e1a8e..c2d3e17beae6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -20,6 +20,7 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -33,6 +34,7 @@ #include <asm/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> #include <net/sock.h> #include <net/snmp.h> @@ -183,7 +185,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int if (dev == NULL) return NULL; - nt = dev->priv; + nt = netdev_priv(dev); dev->init = ipip6_tunnel_init; nt->parms = *parms; @@ -209,7 +211,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev) write_unlock_bh(&ipip6_lock); dev_put(dev); } else { - ipip6_tunnel_unlink((struct ip_tunnel*)dev->priv); + ipip6_tunnel_unlink(netdev_priv(dev)); dev_put(dev); } } @@ -345,7 +347,7 @@ out: rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { - struct ip_tunnel * t = (struct ip_tunnel*)rt6i->rt6i_dev->priv; + struct ip_tunnel *t = 
netdev_priv(rt6i->rt6i_dev); if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { rel_type = ICMPV6_DEST_UNREACH; rel_code = ICMPV6_ADDR_UNREACH; @@ -380,6 +382,7 @@ static int ipip6_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = skb->data; memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; tunnel->stat.rx_packets++; @@ -422,7 +425,7 @@ static inline u32 try_6to4(struct in6_addr *v6dst) static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; struct ipv6hdr *iph6 = skb->nh.ipv6h; @@ -551,6 +554,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; dst_release(skb->dst); skb->dst = &rt->u.dst; @@ -607,7 +611,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t = ipip6_tunnel_locate(&p, 0); } if (t == NULL) - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); memcpy(&p, &t->parms, sizeof(p)); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; @@ -644,7 +648,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; break; } - t = (struct ip_tunnel*)dev->priv; + t = netdev_priv(dev); ipip6_tunnel_unlink(t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; @@ -680,7 +684,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) goto done; err = -EPERM; - if (t == ipip6_fb_tunnel_dev->priv) + if (t == netdev_priv(ipip6_fb_tunnel_dev)) goto done; dev = t->dev; } @@ -697,7 +701,7 @@ done: static 
struct net_device_stats *ipip6_tunnel_get_stats(struct net_device *dev) { - return &(((struct ip_tunnel*)dev->priv)->stat); + return &(((struct ip_tunnel*)netdev_priv(dev))->stat); } static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) @@ -720,7 +724,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_SIT; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = 1500 - sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; dev->iflink = 0; dev->addr_len = 4; @@ -732,7 +736,7 @@ static int ipip6_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel; struct iphdr *iph; - tunnel = (struct ip_tunnel*)dev->priv; + tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; tunnel->dev = dev; @@ -772,7 +776,7 @@ static int ipip6_tunnel_init(struct net_device *dev) static int __init ipip6_fb_tunnel_init(struct net_device *dev) { - struct ip_tunnel *tunnel = dev->priv; + struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; tunnel->dev = dev; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8827389abaf7..66d04004afda 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -48,6 +48,7 @@ #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> +#include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -59,232 +60,45 @@ #include <net/addrconf.h> #include <net/snmp.h> #include <net/dsfield.h> +#include <net/timewait_sock.h> #include <asm/uaccess.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +/* Socket used for sending RSTs and ACKs */ +static struct socket *tcp6_socket; + static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, +static void tcp_v6_send_check(struct sock *sk, int len, 
struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); - -static struct tcp_func ipv6_mapped; -static struct tcp_func ipv6_specific; - -static inline int tcp_v6_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) -{ - const struct sock *sk2; - const struct hlist_node *node; - - /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; - } - return node != NULL; -} +static struct inet_connection_sock_af_ops ipv6_mapped; +static struct inet_connection_sock_af_ops ipv6_specific; -/* Grrr, addr_type already calculated by caller, but I don't want - * to add some silly "cookie" argument to this method just for that. - * But it doesn't matter, the recalculation is in the rarest path - * this function ever takes. - */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - struct hlist_node *node; - int ret; - - local_bh_disable(); - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - - do { - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? 
It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. */ - snum = rover; - } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (tb && !hlist_empty(&tb->owners)) { - if (tb->fastreuse > 0 && sk->sk_reuse && - sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_v6_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (tb == NULL) { - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); - if (tb == NULL) - goto fail_unlock; - } - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - -static __inline__ void __tcp_v6_hash(struct sock *sk) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - inet_listen_wlock(&tcp_hashinfo); - } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (tcp_hashinfo.ehash_size - 1); - list = &tcp_hashinfo.ehash[hash].chain; - lock = &tcp_hashinfo.ehash[hash].lock; - write_lock(lock); - } - - 
__sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet6_csk_bind_conflict); } - static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->af_specific == &ipv6_mapped) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { tcp_prot.hash(sk); return; } local_bh_disable(); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); local_bh_enable(); } } -/* - * Open request hash tables. - */ - -static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) -{ - u32 a, b, c; - - a = raddr->s6_addr32[0]; - b = raddr->s6_addr32[1]; - c = raddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += raddr->s6_addr32[3]; - b += (u32) rport; - __jhash_mix(a, b, c); - - return c & (TCP_SYNQ_HSIZE - 1); -} - -static struct request_sock *tcp_v6_search_req(const struct sock *sk, - struct request_sock ***prevp, - __u16 rport, - struct in6_addr *raddr, - struct in6_addr *laddr, - int iif) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); - - if (inet_rsk(req)->rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); - *prevp = prev; - return req; - } - } - - return NULL; -} - static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -308,195 +122,12 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int 
__tcp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - if (tcptw->tw_ts_recent_stamp && - (!twp || - (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { - /* See comment in tcp_ipv4.c */ - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (!tp->write_seq) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); - goto unique; - } else - goto not_unique; - } - } - tw = NULL; - - /* And established part... 
*/ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 tcpv6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int tcp_v6_hash_connect(struct sock *sk) -{ - unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + tcpv6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. 
- */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __tcp_v6_hash(sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v6_hash(sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... 
Walk to established hash table */ - ret = __tcp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p = NULL, final; @@ -571,7 +202,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = tp->ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -583,14 +214,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - tp->af_specific = &ipv6_mapped; + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - tp->ext_header_len = exthdrlen; - tp->af_specific = &ipv6_specific; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -643,16 +274,17 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tp->ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v6_hash_connect(sk); + err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; @@ 
-758,7 +390,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (tp->pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ @@ -775,8 +407,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -822,7 +454,7 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = NULL; @@ -888,8 +520,8 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (tcp6_rsk(req)->pktopts) - kfree_skb(tcp6_rsk(req)->pktopts); + if (inet6_rsk(req)->pktopts) + kfree_skb(inet6_rsk(req)->pktopts); } static struct request_sock_ops tcp6_request_sock_ops = { @@ -901,26 +533,15 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; -static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - - if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) || - (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || - ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; - } - return 0; -} - +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct 
tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); @@ -993,7 +614,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { - ip6_xmit(NULL, buff, &fl, NULL, 0); + ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); return; @@ -1057,7 +678,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { - ip6_xmit(NULL, buff, &fl, NULL, 0); + ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0); TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); return; } @@ -1091,8 +712,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) struct sock *nsk; /* Find possible connection requests. 
*/ - req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->source, + &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1116,23 +738,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp6_request_sock *treq; + struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); @@ -1157,7 +768,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp6_request_sock_ops); + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) goto drop; @@ -1170,7 +781,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = tcp6_rsk(req); + treq = inet6_rsk(req); ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); TCP_ECN_create_request(req, skb->h.th); @@ -1196,8 +807,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v6_send_synack(sk, req, NULL)) goto drop; - tcp_v6_synq_add(sk, 
req); - + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop: @@ -1212,7 +822,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; @@ -1247,7 +857,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); - newtp->af_specific = &ipv6_mapped; + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; @@ -1261,10 +871,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 af_tcp.af_specific. + worked with IPv6 icsk.icsk_af_ops. Sync it now. 
*/ - tcp_sync_mss(newsk, newtp->pmtu_cookie); + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -1371,10 +981,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, sock_kfree_s(sk, opt, opt->tot_len); } - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newtp->ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); @@ -1382,7 +992,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; - __tcp_v6_hash(newsk); + __inet6_hash(&tcp_hashinfo, newsk); inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1546,7 +1156,7 @@ ipv6_pktoptions: return 0; } -static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int tcp_v6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct tcphdr *th; @@ -1679,139 +1289,16 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk) -{ - int err; - struct dst_entry *dst; - struct ipv6_pinfo *np = inet6_sk(sk); - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { - 
sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - return 0; -} - -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) -{ - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; - struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); - - if (err) { - sk->sk_err_soft = -err; - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - skb->dst = dst_clone(dst); - - /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - - return ip6_xmit(sk, skb, &fl, np->opt, 0); -} - -static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - 
sin6->sin6_port = inet_sk(sk)->dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; -} - static int tcp_v6_remember_stamp(struct sock *sk) { /* Alas, not yet... */ return 0; } -static struct tcp_func ipv6_specific = { - .queue_xmit = tcp_v6_xmit, +static struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, - .rebuild_header = tcp_v6_rebuild_header, + .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v6_remember_stamp, @@ -1819,7 +1306,7 @@ static struct tcp_func ipv6_specific = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1827,7 +1314,7 @@ static struct tcp_func ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct tcp_func ipv6_mapped = { +static struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1838,7 +1325,7 @@ static struct tcp_func ipv6_mapped = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1877,8 +1364,9 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; - tp->af_specific = &ipv6_specific; + icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1900,14 +1388,13 @@ static int tcp_v6_destroy_sock(struct sock *sk) static void 
get_openreq6(struct seq_file *seq, struct sock *sk, struct request_sock *req, int i, int uid) { - struct in6_addr *dest, *src; int ttd = req->expires - jiffies; + struct in6_addr *src = &inet6_rsk(req)->loc_addr; + struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; if (ttd < 0) ttd = 0; - src = &tcp6_rsk(req)->loc_addr; - dest = &tcp6_rsk(req)->rmt_addr; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", @@ -1988,14 +1475,14 @@ static void get_timewait6_sock(struct seq_file *seq, { struct in6_addr *dest, *src; __u16 destp, srcp; - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tcp6tw->tw_v6_daddr; - src = &tcp6tw->tw_v6_rcv_saddr; + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2093,7 +1580,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, }; @@ -2110,13 +1597,27 @@ static struct inet_protosw tcpv6_protosw = { .ops = &inet6_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }; void __init tcpv6_init(void) { + int err; + /* register inet6 protocol */ if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0) printk(KERN_ERR "tcpv6_init: Could not register protocol\n"); inet6_register_protosw(&tcpv6_protosw); + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_TCP, &tcp6_socket); + if (err < 0) + panic("Failed to create the TCPv6 control socket.\n"); + tcp6_socket->sk->sk_allocation = GFP_ATOMIC; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this 
socket to see incoming + * packets. + */ + tcp6_socket->sk->sk_prot->unhash(tcp6_socket->sk); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5cc8731eb55b..c47648892c04 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -36,6 +36,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/init.h> +#include <linux/skbuff.h> #include <asm/uaccess.h> #include <net/sock.h> @@ -300,20 +301,7 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (flags & MSG_DONTWAIT) { UDP6_INC_STATS_USER(UDP_MIB_INERRORS); @@ -447,7 +435,7 @@ out: read_unlock(&udp_hash_lock); } -static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int udpv6_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct sock *sk; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 28c29d78338e..1ca2da68ef69 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -11,6 +11,8 @@ #include <linux/module.h> #include <linux/string.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/ip.h> @@ -26,7 +28,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) IP6_ECN_set_ce(inner_iph); } -int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) +int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi) { struct sk_buff *skb = *pskb; int err; @@ -38,7 +40,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) int nexthdr; unsigned int nhoff; - nhoff = *nhoffp; + nhoff = IP6CB(skb)->nhoff; nexthdr = skb->nh.raw[nhoff]; seq = 0; @@ -121,6 +123,8 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, 
unsigned int *nhoffp, u32 spi) skb->sp->len += xfrm_nr; skb->ip_summed = CHECKSUM_NONE; + nf_reset(skb); + if (decaps) { if (!(skb->dev->flags&IFF_LOOPBACK)) { dst_release(skb->dst); @@ -129,7 +133,16 @@ int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) netif_rx(skb); return -1; } else { +#ifdef CONFIG_NETFILTER + skb->nh.ipv6h->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb->nh.raw); + + NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, + ip6_rcv_finish); + return -1; +#else return 1; +#endif } drop_unlock: @@ -144,7 +157,7 @@ drop: EXPORT_SYMBOL(xfrm6_rcv_spi); -int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +int xfrm6_rcv(struct sk_buff **pskb) { - return xfrm6_rcv_spi(pskb, nhoffp, 0); + return xfrm6_rcv_spi(pskb, 0); } diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6b9867717d11..80242172a5df 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -9,9 +9,11 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/compiler.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/icmpv6.h> +#include <linux/netfilter_ipv6.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/ipv6.h> @@ -92,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) return ret; } -int xfrm6_output(struct sk_buff *skb) +static int xfrm6_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; @@ -110,29 +112,35 @@ int xfrm6_output(struct sk_buff *skb) goto error_nolock; } - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; + do { + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; - xfrm6_encap(skb); + xfrm6_encap(skb); - err = x->type->output(x, skb); - if (err) - goto error; + err = x->type->output(x, skb); + if (err) + goto error; - x->curlft.bytes += skb->len; - x->curlft.packets++; + x->curlft.bytes += skb->len; + x->curlft.packets++; - spin_unlock_bh(&x->lock); + spin_unlock_bh(&x->lock); - skb->nh.raw = skb->data; - - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - err = NET_XMIT_BYPASS; + skb->nh.raw = skb->data; + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + dst = skb->dst; + x = dst->xfrm; + } while (x && !x->props.mode); + + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; + err = 0; out_exit: return err; @@ -142,3 +150,33 @@ error_nolock: kfree_skb(skb); goto out_exit; } + +static int xfrm6_output_finish(struct sk_buff *skb) +{ + int err; + + while (likely((err = xfrm6_output_one(skb)) == 0)) { + nf_reset(skb); + + err = nf_hook(PF_INET6, NF_IP6_LOCAL_OUT, &skb, NULL, + skb->dst->dev, dst_output); + if (unlikely(err != 1)) + break; + + if (!skb->dst->xfrm) + return dst_output(skb); + + err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL, + skb->dst->dev, xfrm6_output_finish); + if (unlikely(err != 1)) + break; + } + + return err; +} + +int 
xfrm6_output(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev, + xfrm6_output_finish); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index bf0d0abc3871..a5723024d3b3 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -15,6 +15,7 @@ #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <net/ipv6.h> +#include <net/addrconf.h> static struct xfrm_state_afinfo xfrm6_state_afinfo; @@ -41,6 +42,22 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + if (tmpl->mode && ipv6_addr_any((struct in6_addr*)&x->props.saddr)) { + struct rt6_info *rt; + struct flowi fl_tunnel = { + .nl_u = { + .ip6_u = { + .daddr = *(struct in6_addr *)daddr, + } + } + }; + if (!xfrm_dst_lookup((struct xfrm_dst **)&rt, + &fl_tunnel, AF_INET6)) { + ipv6_get_saddr(&rt->u.dst, (struct in6_addr *)daddr, + (struct in6_addr *)&x->props.saddr); + dst_release(&rt->u.dst); + } + } x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; x->props.family = AF_INET6; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index fbef7826a74f..8cfc58b96fc2 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -259,8 +259,7 @@ try_next_2:; spi = 0; goto out; alloc_spi: - X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " NIP6_FMT "\n", __FUNCTION__, NIP6(*(struct in6_addr *)saddr)); x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC); @@ -323,9 +322,8 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) list_byaddr) { if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { - X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " - "for %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " - "found at %p\n", + 
X6TPRINTK3(KERN_DEBUG "%s(): x6spi object for " NIP6_FMT + " found at %p\n", __FUNCTION__, NIP6(*(struct in6_addr *)saddr), x6spi); @@ -397,7 +395,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler) EXPORT_SYMBOL(xfrm6_tunnel_deregister); -static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int xfrm6_tunnel_rcv(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; @@ -405,11 +403,11 @@ static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) u32 spi; /* device-like_ip6ip6_handler() */ - if (handler && handler->handler(pskb, nhoffp) == 0) + if (handler && handler->handler(pskb) == 0) return 0; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(pskb, nhoffp, spi); + return xfrm6_rcv_spi(pskb, spi); } static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 34b3bb868409..0fb513a34d11 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -29,6 +29,7 @@ */ #include <linux/config.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/if_arp.h> #include <linux/if_ether.h> @@ -75,7 +76,7 @@ static struct datalink_proto *pEII_datalink; static struct datalink_proto *p8023_datalink; static struct datalink_proto *pSNAP_datalink; -static struct proto_ops ipx_dgram_ops; +static const struct proto_ops ipx_dgram_ops; LIST_HEAD(ipx_interfaces); DEFINE_SPINLOCK(ipx_interfaces_lock); @@ -1884,7 +1885,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EINVAL; break; default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1901,7 +1902,7 @@ static struct net_proto_family ipx_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { .family = PF_IPX, .owner = THIS_MODULE, .release = 
ipx_release, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 6f92f9c62990..759445648667 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -43,6 +43,7 @@ ********************************************************************/ #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> @@ -62,12 +63,12 @@ static int irda_create(struct socket *sock, int protocol); -static struct proto_ops irda_stream_ops; -static struct proto_ops irda_seqpacket_ops; -static struct proto_ops irda_dgram_ops; +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops irda_ultra_ops; +static const struct proto_ops irda_ultra_ops; #define ULTRA_MAX_DATA 382 #endif /* CONFIG_IRDA_ULTRA */ @@ -1438,8 +1439,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, /* * POSIX 1003.1g mandates this order. 
*/ - if (sk->sk_err) - ret = sock_error(sk); + ret = sock_error(sk); + if (ret) + break; else if (sk->sk_shutdown & RCV_SHUTDOWN) ; else if (noblock) @@ -1821,7 +1823,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -EINVAL; default: IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); - return dev_ioctl(cmd, (void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ @@ -2463,7 +2465,7 @@ static struct net_proto_family irda_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2484,7 +2486,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2505,7 +2507,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2527,7 +2529,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { }; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index 70543d89438b..890bac0d4a56 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> +#include <linux/capability.h> #include <linux/if.h> #include <linux/if_ether.h> #include 
<linux/if_arp.h> diff --git a/net/irda/iriap.c b/net/irda/iriap.c index b8bb78af8b8a..254f90746900 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -364,7 +364,7 @@ static void iriap_disconnect_request(struct iriap_cb *self) /* * Function iriap_getvaluebyclass (addr, name, attr) * - * Retreive all values from attribute in all objects with given class + * Retrieve all values from attribute in all objects with given class * name */ int iriap_getvaluebyclass_request(struct iriap_cb *self, diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c index 75f2666e8630..c6d169fbdceb 100644 --- a/net/irda/irias_object.c +++ b/net/irda/irias_object.c @@ -82,8 +82,7 @@ struct ias_object *irias_new_object( char *name, int id) IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); - obj = (struct ias_object *) kmalloc(sizeof(struct ias_object), - GFP_ATOMIC); + obj = kmalloc(sizeof(struct ias_object), GFP_ATOMIC); if (obj == NULL) { IRDA_WARNING("%s(), Unable to allocate object!\n", __FUNCTION__); @@ -348,8 +347,7 @@ void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); IRDA_ASSERT(name != NULL, return;); - attrib = (struct ias_attrib *) kmalloc(sizeof(struct ias_attrib), - GFP_ATOMIC); + attrib = kmalloc(sizeof(struct ias_attrib), GFP_ATOMIC); if (attrib == NULL) { IRDA_WARNING("%s: Unable to allocate attribute!\n", __FUNCTION__); @@ -385,8 +383,7 @@ void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, IRDA_ASSERT(name != NULL, return;); IRDA_ASSERT(octets != NULL, return;); - attrib = (struct ias_attrib *) kmalloc(sizeof(struct ias_attrib), - GFP_ATOMIC); + attrib = kmalloc(sizeof(struct ias_attrib), GFP_ATOMIC); if (attrib == NULL) { IRDA_WARNING("%s: Unable to allocate attribute!\n", __FUNCTION__); @@ -420,8 +417,7 @@ void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, IRDA_ASSERT(name != NULL, return;); IRDA_ASSERT(value != NULL, return;); - attrib 
= (struct ias_attrib *) kmalloc(sizeof( struct ias_attrib), - GFP_ATOMIC); + attrib = kmalloc(sizeof( struct ias_attrib), GFP_ATOMIC); if (attrib == NULL) { IRDA_WARNING("%s: Unable to allocate attribute!\n", __FUNCTION__); diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index b391cb3893d4..e4fe1e80029c 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -248,6 +248,7 @@ #include <linux/netdevice.h> #include <linux/miscdevice.h> #include <linux/poll.h> +#include <linux/capability.h> #include <linux/config.h> #include <linux/ctype.h> /* isspace() */ #include <asm/uaccess.h> diff --git a/net/key/af_key.c b/net/key/af_key.c index 39031684b65c..43f1ce74187d 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -15,6 +15,7 @@ */ #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/socket.h> @@ -113,7 +114,7 @@ static __inline__ void pfkey_unlock_table(void) } -static struct proto_ops pfkey_ops; +static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { @@ -297,8 +298,7 @@ static int pfkey_error(struct sadb_msg *orig, int err, struct sock *sk) err = EINTR; if (err >= 512) err = EINVAL; - if (err <= 0 || err >= 256) - BUG(); + BUG_ON(err <= 0 || err >= 256); hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); @@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = { [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), + [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), }; /* Verify sadb_address_{len,prefixlen} against sa_family. 
*/ @@ -383,6 +384,55 @@ static int verify_address_len(void *p) return 0; } +static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) +{ + int len = 0; + + len += sizeof(struct sadb_x_sec_ctx); + len += sec_ctx->sadb_x_ctx_len; + len += sizeof(uint64_t) - 1; + len /= sizeof(uint64_t); + + return len; +} + +static inline int verify_sec_ctx_len(void *p) +{ + struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p; + int len; + + if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE) + return -EINVAL; + + len = pfkey_sec_ctx_len(sec_ctx); + + if (sec_ctx->sadb_x_sec_len != len) + return -EINVAL; + + return 0; +} + +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx) +{ + struct xfrm_user_sec_ctx *uctx = NULL; + int ctx_size = sec_ctx->sadb_x_ctx_len; + + uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + + if (!uctx) + return NULL; + + uctx->len = pfkey_sec_ctx_len(sec_ctx); + uctx->exttype = sec_ctx->sadb_x_sec_exttype; + uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; + uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; + uctx->ctx_len = sec_ctx->sadb_x_ctx_len; + memcpy(uctx + 1, sec_ctx + 1, + uctx->ctx_len); + + return uctx; +} + static int present_and_same_family(struct sadb_address *src, struct sadb_address *dst) { @@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h if (verify_address_len(p)) return -EINVAL; } + if (ext_type == SADB_X_EXT_SEC_CTX) { + if (verify_sec_ctx_len(p)) + return -EINVAL; + } ext_hdrs[ext_type-1] = p; } p += ext_len; @@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int 
add_keys, sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + /* identity & sensitivity */ if ((x->props.family == AF_INET && @@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, n_port->sadb_x_nat_t_port_reserved = 0; } + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + return skb; } @@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, struct sadb_lifetime *lifetime; struct sadb_sa *sa; struct sadb_key *key; + struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; @@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + goto out; + + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + + if (err) + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; if (sa->sadb_sa_auth) { int keysize = 0; @@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) return 0; } +static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = 
xp->security; + + if (xfrm_ctx) { + int len = sizeof(struct sadb_x_sec_ctx); + len += xfrm_ctx->ctx_len; + return PFKEY_ALIGN8(len); + } + return 0; +} + static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) { int sockaddr_size = pfkey_sockaddr_size(xp->family); @@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + - (socklen * 2))); + (socklen * 2))) + + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) @@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i } } } + + /* security context */ + if ((xfrm_ctx = xp->security)) { + int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); + + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size); + sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); } @@ -1976,12 +2099,13 @@ out: static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) { - int err; + int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if 
(!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (xp->selector.dport) xp->selector.dport_mask = ~0; + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) { + err = -ENOBUFS; + goto out; + } + + err = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (err) + goto out; + } + xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; @@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); - if (err) { - kfree(xp); - return err; - } + + if (err) + goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; @@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h return 0; out: + security_xfrm_policy_free(xp); kfree(xp); return err; } @@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg int err; struct sadb_address *sa; struct sadb_x_policy *pol; - struct xfrm_policy *xp; + struct xfrm_policy *xp, tmp; struct xfrm_selector sel; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg if (sel.dport) sel.dport_mask = ~0; - xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + memset(&tmp, 0, sizeof(struct xfrm_policy)); + + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = 
pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + return -ENOMEM; + + err = security_xfrm_policy_alloc(&tmp, uctx); + kfree(uctx); + + if (err) + return err; + } + + xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1); + security_xfrm_policy_free(&tmp); if (xp == NULL) return -ENOENT; @@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, { struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + struct sadb_x_sec_ctx *sec_ctx; switch (family) { case AF_INET: @@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; + /* security context too */ + if (len >= (pol->sadb_x_policy_len*8 + + sizeof(struct sadb_x_sec_ctx))) { + char *p = (char *)pol; + struct xfrm_user_sec_ctx *uctx; + + p += pol->sadb_x_policy_len*8; + sec_ctx = (struct sadb_x_sec_ctx *)p; + if (len < pol->sadb_x_policy_len*8 + + sec_ctx->sadb_x_sec_len) + goto out; + if ((*dir = verify_sec_ctx_len(p))) + goto out; + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + *dir = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (*dir) + goto out; + } + *dir = pol->sadb_x_policy_dir-1; return xp; out: + security_xfrm_policy_free(xp); kfree(xp); return NULL; } @@ -2946,7 +3127,7 @@ out: return err; } -static struct proto_ops pfkey_ops = { +static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. 
*/ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c3f0b0783453..8171c53bc0ed 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -36,7 +36,7 @@ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; -static struct proto_ops llc_ui_ops; +static const struct proto_ops llc_ui_ops; static int llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); @@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo) /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) { - rc = sock_error(sk); + rc = sock_error(sk); + if (rc) break; - } rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -960,7 +959,7 @@ out: static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } /** @@ -1099,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops llc_ui_ops = { +static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 794c41d19b28..99c0a0fa4a97 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -95,4 +95,269 @@ config NF_CONNTRACK_FTP To compile it as a module, choose M here. If unsure, say N. +config NF_CT_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK + depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m + help + This option enables support for a netlink-based userspace interface + endmenu + +config NETFILTER_XTABLES + tristate "Netfilter Xtables support (required for ip_tables)" + help + This is required if you intend to use any of ip_tables, + ip6_tables or arp_tables. 
+ +# alphabetically ordered list of targets + +config NETFILTER_XT_TARGET_CLASSIFY + tristate '"CLASSIFY" target support' + depends on NETFILTER_XTABLES + help + This option adds a `CLASSIFY' target, which enables the user to set + the priority of a packet. Some qdiscs can use this value for + classification, among these are: + + atm, cbq, dsmark, pfifo_fast, htb, prio + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_CONNMARK + tristate '"CONNMARK" target support' + depends on NETFILTER_XTABLES + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + help + This option adds a `CONNMARK' target, which allows one to manipulate + the connection mark value. Similar to the MARK target, but + affects the connection mark value rather than the packet mark value. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_CONNMARK.o. If unsure, say `N'. + +config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_XTABLES + help + This option adds a `MARK' target, which allows you to create rules + in the `mangle' table which alter the netfilter mark (nfmark) field + associated with the packet prior to routing. This can change + the routing method (see `Use netfilter MARK value as routing + key') and can also be used by other subsystems to change their + behavior. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_NFQUEUE + tristate '"NFQUEUE" target Support' + depends on NETFILTER_XTABLES + help + This Target replaced the old obsolete QUEUE target. + + As opposed to QUEUE, it supports 65535 different queues, + not just one. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config NETFILTER_XT_TARGET_NOTRACK + tristate '"NOTRACK" target support' + depends on NETFILTER_XTABLES + depends on IP_NF_RAW || IP6_NF_RAW + depends on IP_NF_CONNTRACK || NF_CONNTRACK + help + The NOTRACK target allows a select rule to specify + which packets *not* to enter the conntrack/NAT + subsystem with all the consequences (no ICMP error tracking, + no protocol helpers for the selected packets). + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_COMMENT + tristate '"comment" match support' + depends on NETFILTER_XTABLES + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNBYTES + tristate '"connbytes" per-connection counter match support' + depends on NETFILTER_XTABLES + depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || NF_CT_ACCT + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNMARK + tristate '"connmark" connection mark match support' + depends on NETFILTER_XTABLES + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || NF_CONNTRACK_MARK + help + This option adds a `connmark' match, which allows you to match the + connection mark value previously set for the session by `CONNMARK'. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_connmark.o. If unsure, say `N'. 
+ +config NETFILTER_XT_MATCH_CONNTRACK + tristate '"conntrack" connection tracking match support' + depends on NETFILTER_XTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_DCCP + tristate '"DCCP" protocol match support' + depends on NETFILTER_XTABLES + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_HELPER + tristate '"helper" match support' + depends on NETFILTER_XTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config NETFILTER_XT_MATCH_LENGTH + tristate '"length" match support' + depends on NETFILTER_XTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_LIMIT + tristate '"limit" match support' + depends on NETFILTER_XTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config NETFILTER_XT_MATCH_MAC + tristate '"mac" address match support' + depends on NETFILTER_XTABLES + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_MARK + tristate '"mark" match support' + depends on NETFILTER_XTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_PHYSDEV + tristate '"physdev" match support' + depends on NETFILTER_XTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_PKTTYPE + tristate '"pkttype" packet type match support' + depends on NETFILTER_XTABLES + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_REALM + tristate '"realm" match support' + depends on NETFILTER_XTABLES + select NET_CLS_ROUTE + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_SCTP + tristate '"sctp" protocol match support' + depends on NETFILTER_XTABLES + help + With this option enabled, you will be able to use the + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. 
+ + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_STATE + tristate '"state" match support' + depends on NETFILTER_XTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STRING + tristate '"string" match support' + depends on NETFILTER_XTABLES + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_BM + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_TCPMSS + tristate '"tcpmss" match support' + depends on NETFILTER_XTABLES + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. 
+ diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 55f019ad2c08..746172ebc91b 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,4 +1,5 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o +nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -6,10 +7,43 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o -nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o - +# connection tracking obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o -obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o # SCTP protocol connection tracking obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o + +# netlink interface for nf_conntrack +obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o + +# connection tracking helpers +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o + +# generic X tables +obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o + +# targets +obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o + +# matches +obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o 
+obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o +obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a7c7b490cf22..62bb509f05d4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -82,6 +82,8 @@ unsigned int nf_ct_log_invalid; static LIST_HEAD(unconfirmed); static int nf_conntrack_vmalloc; +static unsigned int nf_conntrack_next_id = 1; +static unsigned int nf_conntrack_expect_next_id = 1; #ifdef CONFIG_NF_CONNTRACK_EVENTS struct notifier_block *nf_conntrack_chain; struct notifier_block *nf_conntrack_expect_chain; @@ -184,7 +186,7 @@ DECLARE_MUTEX(nf_ct_cache_mutex); extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; struct nf_conntrack_protocol * -nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) +__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol) { if (unlikely(nf_ct_protos[l3proto] == NULL)) return &nf_conntrack_generic_protocol; @@ -192,6 +194,50 @@ nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) return nf_ct_protos[l3proto][protocol]; } +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct nf_conntrack_protocol * +nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol) +{ + struct nf_conntrack_protocol *p; + + preempt_disable(); + p = __nf_ct_proto_find(l3proto, protocol); + if (p) { + if (!try_module_get(p->me)) + p = 
&nf_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void nf_ct_proto_put(struct nf_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct nf_conntrack_l3proto * +nf_ct_l3proto_find_get(u_int16_t l3proto) +{ + struct nf_conntrack_l3proto *p; + + preempt_disable(); + p = __nf_ct_l3proto_find(l3proto); + if (p) { + if (!try_module_get(p->me)) + p = &nf_conntrack_generic_l3proto; + } + preempt_enable(); + + return p; +} + +void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) +{ + module_put(p->me); +} + static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -384,7 +430,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, } /* nf_conntrack_expect helper functions */ -static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&nf_conntrack_lock); NF_CT_ASSERT(!timer_pending(&exp->timeout)); @@ -404,6 +450,33 @@ static void expectation_timed_out(unsigned long ul_expect) nf_conntrack_expect_put(exp); } +struct nf_conntrack_expect * +__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct nf_conntrack_expect * +nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + read_lock_bh(&nf_conntrack_lock); + i = __nf_conntrack_expect_find(tuple); + read_unlock_bh(&nf_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. 
*/ static struct nf_conntrack_expect * @@ -432,7 +505,7 @@ find_expectation(const struct nf_conntrack_tuple *tuple) } /* delete all expectations for this conntrack */ -static void remove_expectations(struct nf_conn *ct) +void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conntrack_expect *i, *tmp; @@ -462,7 +535,7 @@ clean_from_lists(struct nf_conn *ct) LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + nf_ct_remove_expectations(ct); } static void @@ -482,12 +555,11 @@ destroy_conntrack(struct nf_conntrack *nfct) /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ - l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); if (l3proto && l3proto->destroy) l3proto->destroy(ct); - proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -499,7 +571,7 @@ destroy_conntrack(struct nf_conntrack *nfct) * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + nf_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. 
*/ if (!nf_ct_is_confirmed(ct)) { @@ -540,7 +612,7 @@ conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, && nf_ct_tuple_equal(tuple, &i->tuple); } -static struct nf_conntrack_tuple_hash * +struct nf_conntrack_tuple_hash * __nf_conntrack_find(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { @@ -575,6 +647,29 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, return h; } +static void __nf_conntrack_hash_insert(struct nf_conn *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++nf_conntrack_next_id; + list_prepend(&nf_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&nf_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void nf_conntrack_hash_insert(struct nf_conn *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&nf_conntrack_lock); + __nf_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&nf_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff **pskb) @@ -621,10 +716,7 @@ __nf_conntrack_confirm(struct sk_buff **pskb) /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&nf_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&nf_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __nf_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
*/ @@ -708,13 +800,41 @@ static inline int helper_cmp(const struct nf_conntrack_helper *i, } static struct nf_conntrack_helper * -nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) +__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct nf_conntrack_helper *, tuple); } +struct nf_conntrack_helper * +nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + + /* need nf_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&nf_conntrack_lock); + + helper = __nf_ct_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&nf_conntrack_lock); + + return helper; +} + +void nf_ct_helper_put(struct nf_conntrack_helper *helper) +{ + module_put(helper->me); +} + static struct nf_conn * __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, @@ -744,7 +864,7 @@ __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, /* find features needed by this conntrack. 
*/ features = l3proto->get_features(orig); read_lock_bh(&nf_conntrack_lock); - if (nf_ct_find_helper(repl) != NULL) + if (__nf_ct_helper_find(repl) != NULL) features |= NF_CT_F_HELP; read_unlock_bh(&nf_conntrack_lock); @@ -794,7 +914,7 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, { struct nf_conntrack_l3proto *l3proto; - l3proto = nf_ct_find_l3proto(orig->src.l3num); + l3proto = __nf_ct_l3proto_find(orig->src.l3num); return __nf_conntrack_alloc(orig, repl, l3proto); } @@ -853,7 +973,7 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, nf_conntrack_get(&conntrack->master->ct_general); NF_CT_STAT_INC(expect_new); } else { - conntrack->helper = nf_ct_find_helper(&repl_tuple); + conntrack->helper = __nf_ct_helper_find(&repl_tuple); NF_CT_STAT_INC(new); } @@ -947,13 +1067,13 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) return NF_ACCEPT; } - l3proto = nf_ct_find_l3proto((u_int16_t)pf); + l3proto = __nf_ct_l3proto_find((u_int16_t)pf); if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { DEBUGP("not prepared to track yet or error occured\n"); return -ret; } - proto = nf_ct_find_proto((u_int16_t)pf, protonum); + proto = __nf_ct_proto_find((u_int16_t)pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -1002,9 +1122,9 @@ int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig) { return nf_ct_invert_tuple(inverse, orig, - nf_ct_find_l3proto(orig->src.l3num), - nf_ct_find_proto(orig->src.l3num, - orig->dst.protonum)); + __nf_ct_l3proto_find(orig->src.l3num), + __nf_ct_proto_find(orig->src.l3num, + orig->dst.protonum)); } /* Would two expected things clash? 
*/ @@ -1096,6 +1216,7 @@ static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++nf_conntrack_expect_next_id; atomic_inc(&exp->use); NF_CT_STAT_INC(expect_create); } @@ -1129,6 +1250,7 @@ static inline int refresh_timer(struct nf_conntrack_expect *i) int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) { struct nf_conntrack_expect *i; + struct nf_conn *master = expect->master; int ret; DEBUGP("nf_conntrack_expect_related %p\n", related_to); @@ -1149,9 +1271,9 @@ int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) } } /* Will be over limit? */ - if (expect->master->helper->max_expected && - expect->master->expecting >= expect->master->helper->max_expected) - evict_oldest_expect(expect->master); + if (master->helper->max_expected && + master->expecting >= master->helper->max_expected) + evict_oldest_expect(master); nf_conntrack_expect_insert(expect); nf_conntrack_expect_event(IPEXP_NEW, expect); @@ -1175,7 +1297,7 @@ void nf_conntrack_alter_reply(struct nf_conn *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = nf_ct_find_helper(newreply); + conntrack->helper = __nf_ct_helper_find(newreply); write_unlock_bh(&nf_conntrack_lock); } @@ -1200,6 +1322,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) return 0; } +struct nf_conntrack_helper * +__nf_conntrack_helper_find_byname(const char *name) +{ + struct nf_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { @@ -1283,6 +1418,51 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, nf_conntrack_event_cache(event, skb); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + 
defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t) +}; + +int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Used by ipt_REJECT and ip6t_REJECT. */ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) { @@ -1365,6 +1545,11 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) get_order(sizeof(struct list_head) * size)); } +void nf_conntrack_flush() +{ + nf_ct_iterate_cleanup(kill_all, NULL); +} + /* Mishearing the voices in his head, our hero wonders how he's supposed to kill the mall. 
*/ void nf_conntrack_cleanup(void) @@ -1378,7 +1563,7 @@ void nf_conntrack_cleanup(void) nf_ct_event_cache_flush(); i_see_dead_people: - nf_ct_iterate_cleanup(kill_all, NULL); + nf_conntrack_flush(); if (atomic_read(&nf_conntrack_count) != 0) { schedule(); goto i_see_dead_people; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 65080e269f27..ab0c920f0d30 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -44,7 +44,7 @@ static unsigned int ports_c; module_param_array(ports, ushort, &ports_c, 0400); static int loose; -module_param(loose, int, 0600); +module_param(loose, bool, 0600); unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -545,11 +545,11 @@ static int help(struct sk_buff **pskb, different IP address. Simply don't record it for NAT. */ if (cmd.l3num == PF_INET) { - DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + DEBUGP("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT " != " NIPQUAD_FMT "\n", NIPQUAD(cmd.u3.ip), NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); } else { - DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n", + DEBUGP("conntrack_ftp: NOT RECORDING: " NIP6_FMT " != " NIP6_FMT "\n", NIP6(*((struct in6_addr *)cmd.u3.ip6)), NIP6(*((struct in6_addr *)ct->tuplehash[dir] .tuple.src.u3.ip6))); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c new file mode 100644 index 000000000000..73ab16bc7d40 --- /dev/null +++ b/net/netfilter/nf_conntrack_netlink.c @@ -0,0 +1,1653 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. 
+ * + * (C) 2001 by Jay Schulist <jschlst@samba.org> + * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> + * (C) 2003 by Patrick Mchardy <kaber@trash.net> + * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + * + * Derived from ip_conntrack_netlink.c: Port by Pablo Neira Ayuso (05/11/14) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.92"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_protocol *proto; + int ret = 0; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + /* If no protocol helper is found, this function will return the + * generic protocol helper, so proto won't *ever* be NULL */ + proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum); + if (likely(proto->tuple_to_nfattr)) + ret = proto->tuple_to_nfattr(skb, tuple); + + nf_ct_proto_put(proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + if (likely(l3proto->tuple_to_nfattr)) + ret = l3proto->tuple_to_nfattr(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + nf_ct_l3proto_put(l3proto); + + if (unlikely(ret < 0)) + return ret; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ret = ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct 
nf_conntrack_protocol *proto = nf_ct_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + struct nfattr *nest_proto; + int ret; + + if (!proto->to_nfattr) { + nf_ct_proto_put(proto); + return 0; + } + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + nf_ct_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int32_t tmp; + + tmp = htonl(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); + + tmp = htonl(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct nf_conn *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; + nfmsg->nfgen_family = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct nf_conn *ct = (struct nf_conn *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, group; + + /* ignore our fake conntrack entry */ + if (ct == &nf_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + group = NFNLGRP_CONNTRACK_DESTROY; + } else if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + group = NFNLGRP_CONNTRACK_NEW; + } else if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + group = NFNLGRP_CONNTRACK_UPDATE; + } else + return 
NOTIFY_DONE; + + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, group, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +#define L3PROTO(ct) ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conn *ct = NULL; + struct nf_conntrack_tuple_hash *h; + struct list_head *i; 
+ u_int32_t *id = (u_int32_t *) &cb->args[1]; + struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&nf_conntrack_lock); + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { + h = (struct nf_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + /* Dump entries of a given L3 protocol number. + * If it is not specified, ie. l3proto == 0, + * then dump everything. */ + if (l3proto && L3PROTO(ct) != l3proto) + continue; + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conn *ct = NULL; + struct nf_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&nf_conntrack_lock); + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { + h = (struct nf_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (l3proto && L3PROTO(ct) != l3proto) + continue; + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + 
memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct nf_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_IP_MAX, attr); + + l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + + if (likely(l3proto->nfattr_to_tuple)) + ret = l3proto->nfattr_to_tuple(tb, tuple); + + nf_ct_l3proto_put(l3proto); + + DEBUGP("leaving\n"); + + return ret; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct nf_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct nf_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum); + + if (likely(proto->nfattr_to_tuple)) + ret = proto->nfattr_to_tuple(tb, tuple); + + nf_ct_proto_put(proto); + + return ret; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple, + enum ctattr_tuple type, u_int8_t l3num) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tuple, 0, sizeof(*tuple)); + + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + tuple->src.l3num = l3num; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if 
(!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + NF_CT_DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct nf_conn *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + return -EINVAL; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. 
on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; +} + +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct nf_conn *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(range, 0, sizeof(*range)); + + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; +} + +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + 
u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else { + /* Flush the whole table */ + nf_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + nf_ct_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + + nf_ct_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct sk_buff *skb2 = NULL; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + u32 rlen; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) + err = 
ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = nf_ct_tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) { + nf_ct_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + nf_ct_put(ct); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +free: + kfree_skb(skb2); +out: + return err; +} + +static inline int +ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) +{ + unsigned long d; + unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1])); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. 
ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[]) +{ + struct nf_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __nf_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... 
*/ + nf_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static inline int +ctnetlink_change_protoinfo(struct nf_conn *ct, struct nfattr *cda[]) +{ + struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1]; + struct nf_conntrack_protocol *proto; + u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + u_int16_t l3num = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + int err = 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); + + proto = nf_ct_proto_find_get(l3num, npt); + + if (proto->from_nfattr) + err = proto->from_nfattr(tb, ct); + nf_ct_proto_put(proto); + + return err; +} + +static int +ctnetlink_change_conntrack(struct nf_conn *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_PROTOINFO-1]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct nf_conntrack_tuple *otuple, + struct nf_conntrack_tuple *rtuple) +{ + struct nf_conn *ct; + int err = 
-EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = nf_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + if (cda[CTA_PROTOINFO-1]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + + ct->helper = nf_ct_helper_find_get(rtuple); + + add_timer(&ct->timeout); + nf_conntrack_hash_insert(ct); + + if (ct->helper) + nf_ct_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + nf_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + if (err < 0) + return err; + } + + write_lock_bh(&nf_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __nf_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __nf_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&nf_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err 
= ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(nf_ct_tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&nf_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct nf_conntrack_expect *exp) +{ + struct nf_conn *master = exp->master; + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct nf_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + 
struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + + DEBUGP("entered %s, last 
id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&nf_conntrack_lock); + list_for_each_prev(i, &nf_conntrack_expect_list) { + exp = (struct nf_conntrack_expect *) i; + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct sk_buff *skb2; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + u32 rlen; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = nf_conntrack_expect_find(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + nf_conntrack_expect_put(exp); + return -ENOENT; + } + } + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = 
ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto free; + + nf_conntrack_expect_put(exp); + + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + +free: + kfree_skb(skb2); +out: + nf_conntrack_expect_put(exp); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_expect *exp, *tmp; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_helper *h; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err; + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = nf_conntrack_expect_find(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + nf_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + nf_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. 
+ * after this line usage count == 0 */ + nf_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&nf_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + write_unlock_bh(&nf_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&nf_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + write_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; 
+ } + + exp = nf_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->flags = 0; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct nf_conntrack_tuple)); + + err = nf_conntrack_expect_related(exp); + nf_conntrack_expect_put(exp); + +out: + nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + write_lock_bh(&nf_conntrack_lock); + exp = __nf_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&nf_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda, u3); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, }, + 
[IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, }, +}; + +static struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + ret = nf_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = nf_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + 
nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&ctnl_notifier_exp); + nf_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 36425f6c833f..46bc27e2756d 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -17,7 +17,7 @@ #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack_protocol.h> -unsigned long nf_ct_generic_timeout = 600*HZ; +unsigned int nf_ct_generic_timeout = 600*HZ; static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 3a600f77b4e0..cf798e61e379 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -62,15 +62,15 @@ static const char *sctp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned long nf_ct_sctp_timeout_closed = 10 SECS; -static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS; -static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS; -static unsigned long nf_ct_sctp_timeout_established = 5 DAYS; -static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; -static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; -static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; - -static unsigned long * sctp_timeouts[] +static unsigned int nf_ct_sctp_timeout_closed = 10 SECS; +static unsigned int nf_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned int 
nf_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned int nf_ct_sctp_timeout_established = 5 DAYS; +static unsigned int nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned int nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned int nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned int * sctp_timeouts[] = { NULL, /* SCTP_CONNTRACK_NONE */ &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6035633d8225..df99138c3b3b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -93,21 +93,21 @@ static const char *tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS; -unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS; -unsigned long nf_ct_tcp_timeout_established = 5 DAYS; -unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS; -unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS; -unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS; -unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS; -unsigned long nf_ct_tcp_timeout_close = 10 SECS; +unsigned int nf_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned int nf_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned int nf_ct_tcp_timeout_established = 5 DAYS; +unsigned int nf_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned int nf_ct_tcp_timeout_close_wait = 60 SECS; +unsigned int nf_ct_tcp_timeout_last_ack = 30 SECS; +unsigned int nf_ct_tcp_timeout_time_wait = 2 MINS; +unsigned int nf_ct_tcp_timeout_close = 10 SECS; /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. 
*/ -unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS; +unsigned int nf_ct_tcp_timeout_max_retrans = 5 MINS; -static unsigned long * tcp_timeouts[] +static unsigned int * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -988,7 +988,7 @@ static int tcp_packet(struct nf_conn *conntrack, || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN or ACK we had let trough + /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then @@ -999,7 +999,7 @@ static int tcp_packet(struct nf_conn *conntrack, * segments we ignored. */ goto in_window; } - /* Just fall trough */ + /* Just fall through */ default: /* Keep compilers happy. */ break; @@ -1147,6 +1147,63 @@ static int tcp_new(struct nf_conn *conntrack, receiver->td_scale); return 1; } + +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct nf_conn *ct) +{ + struct nfattr *nest_parms; + + read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + read_unlock_bh(&tcp_lock); + return -1; +} + +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + +static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct) +{ + struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; + struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; + + /* updates could not contain anything about the private + * 
protocol info, in that case skip the parsing */ + if (!attr) + return 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) + return -EINVAL; + + write_lock_bh(&tcp_lock); + ct->proto.tcp.state = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); + write_unlock_bh(&tcp_lock); + + return 0; +} +#endif struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = { @@ -1160,6 +1217,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = .packet = tcp_packet, .new = tcp_new, .error = tcp_error4, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .from_nfattr = nfattr_to_tcp, + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = @@ -1174,6 +1238,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = .packet = tcp_packet, .new = tcp_new, .error = tcp_error6, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .from_nfattr = nfattr_to_tcp, + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3cae7ce420dd..4264dd079a16 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -27,8 +27,8 @@ #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack_protocol.h> -unsigned long nf_ct_udp_timeout = 30*HZ; -unsigned long nf_ct_udp_timeout_stream = 180*HZ; +unsigned int nf_ct_udp_timeout = 30*HZ; +unsigned int nf_ct_udp_timeout_stream = 180*HZ; static int udp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -196,6 +196,11 @@ struct 
nf_conntrack_protocol nf_conntrack_protocol_udp4 = .packet = udp_packet, .new = udp_new, .error = udp_error4, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = @@ -210,6 +215,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = .packet = udp_packet, .new = udp_new, .error = udp_error6, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_udp4); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5af381f9fe3d..617599aeeead 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -161,14 +161,14 @@ static int ct_seq_show(struct seq_file *s, void *v) if (NF_CT_DIRECTION(hash)) return 0; - l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num); + l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num); NF_CT_ASSERT(l3proto); - proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __nf_ct_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); NF_CT_ASSERT(proto); if (seq_printf(s, "%-8s %u %-8s %u %ld ", @@ -307,9 +307,9 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - nf_ct_find_l3proto(expect->tuple.src.l3num), - nf_ct_find_proto(expect->tuple.src.l3num, - expect->tuple.dst.protonum)); + __nf_ct_l3proto_find(expect->tuple.src.l3num), + 
__nf_ct_proto_find(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -431,25 +431,25 @@ extern int nf_conntrack_max; extern unsigned int nf_conntrack_htable_size; /* From nf_conntrack_proto_tcp.c */ -extern unsigned long nf_ct_tcp_timeout_syn_sent; -extern unsigned long nf_ct_tcp_timeout_syn_recv; -extern unsigned long nf_ct_tcp_timeout_established; -extern unsigned long nf_ct_tcp_timeout_fin_wait; -extern unsigned long nf_ct_tcp_timeout_close_wait; -extern unsigned long nf_ct_tcp_timeout_last_ack; -extern unsigned long nf_ct_tcp_timeout_time_wait; -extern unsigned long nf_ct_tcp_timeout_close; -extern unsigned long nf_ct_tcp_timeout_max_retrans; +extern unsigned int nf_ct_tcp_timeout_syn_sent; +extern unsigned int nf_ct_tcp_timeout_syn_recv; +extern unsigned int nf_ct_tcp_timeout_established; +extern unsigned int nf_ct_tcp_timeout_fin_wait; +extern unsigned int nf_ct_tcp_timeout_close_wait; +extern unsigned int nf_ct_tcp_timeout_last_ack; +extern unsigned int nf_ct_tcp_timeout_time_wait; +extern unsigned int nf_ct_tcp_timeout_close; +extern unsigned int nf_ct_tcp_timeout_max_retrans; extern int nf_ct_tcp_loose; extern int nf_ct_tcp_be_liberal; extern int nf_ct_tcp_max_retrans; /* From nf_conntrack_proto_udp.c */ -extern unsigned long nf_ct_udp_timeout; -extern unsigned long nf_ct_udp_timeout_stream; +extern unsigned int nf_ct_udp_timeout; +extern unsigned int nf_ct_udp_timeout_stream; /* From nf_conntrack_proto_generic.c */ -extern unsigned long nf_ct_generic_timeout; +extern unsigned int nf_ct_generic_timeout; /* Log invalid packets of a given protocol */ static int log_invalid_proto_min = 0; @@ -821,7 +821,7 @@ module_exit(fini); /* Some modules need us, but don't depend directly on any symbol. They should call this. 
*/ -void need_nf_conntrack(void) +void need_conntrack(void) { } @@ -841,13 +841,17 @@ EXPORT_SYMBOL(nf_conntrack_protocol_unregister); EXPORT_SYMBOL(nf_ct_invert_tuplepr); EXPORT_SYMBOL(nf_conntrack_alter_reply); EXPORT_SYMBOL(nf_conntrack_destroyed); -EXPORT_SYMBOL(need_nf_conntrack); +EXPORT_SYMBOL(need_conntrack); EXPORT_SYMBOL(nf_conntrack_helper_register); EXPORT_SYMBOL(nf_conntrack_helper_unregister); EXPORT_SYMBOL(nf_ct_iterate_cleanup); EXPORT_SYMBOL(__nf_ct_refresh_acct); EXPORT_SYMBOL(nf_ct_protos); -EXPORT_SYMBOL(nf_ct_find_proto); +EXPORT_SYMBOL(__nf_ct_proto_find); +EXPORT_SYMBOL(nf_ct_proto_find_get); +EXPORT_SYMBOL(nf_ct_proto_put); +EXPORT_SYMBOL(nf_ct_l3proto_find_get); +EXPORT_SYMBOL(nf_ct_l3proto_put); EXPORT_SYMBOL(nf_ct_l3protos); EXPORT_SYMBOL(nf_conntrack_expect_alloc); EXPORT_SYMBOL(nf_conntrack_expect_put); @@ -867,3 +871,21 @@ EXPORT_SYMBOL(nf_ct_get_tuple); EXPORT_SYMBOL(nf_ct_invert_tuple); EXPORT_SYMBOL(nf_conntrack_in); EXPORT_SYMBOL(__nf_conntrack_attach); +EXPORT_SYMBOL(nf_conntrack_alloc); +EXPORT_SYMBOL(nf_conntrack_free); +EXPORT_SYMBOL(nf_conntrack_flush); +EXPORT_SYMBOL(nf_ct_remove_expectations); +EXPORT_SYMBOL(nf_ct_helper_find_get); +EXPORT_SYMBOL(nf_ct_helper_put); +EXPORT_SYMBOL(__nf_conntrack_helper_find_byname); +EXPORT_SYMBOL(__nf_conntrack_find); +EXPORT_SYMBOL(nf_ct_unlink_expect); +EXPORT_SYMBOL(nf_conntrack_hash_insert); +EXPORT_SYMBOL(__nf_conntrack_expect_find); +EXPORT_SYMBOL(nf_conntrack_expect_find); +EXPORT_SYMBOL(nf_conntrack_expect_list); +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) +EXPORT_SYMBOL(nf_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL(nf_ct_port_nfattr_to_tuple); +#endif diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 95fdf04f1d88..f6063e8f0050 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -212,7 +212,7 @@ int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) } /* Process one complete nfnetlink message. 
*/ -static inline int nfnetlink_rcv_msg(struct sk_buff *skb, +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { struct nfnl_callback *nc; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cba63729313d..e10512e229b6 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; INIT_HLIST_NODE(&inst->hlist); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f28460b61e47..18ed9c5d209c 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid) atomic_set(&inst->id_sequence, 0); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) @@ -345,6 +345,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct nfqnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; + struct nf_info *entinf = entry->info; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; unsigned int tmp_uint; QDEBUG("entered\n"); @@ -361,6 +365,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); + outdev = entinf->outdev; + spin_lock_bh(&queue->lock); switch (queue->copy_mode) { @@ -370,15 +376,15 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, break; case NFQNL_COPY_PACKET: - if (entry->skb->ip_summed == CHECKSUM_HW && - (*errp = skb_checksum_help(entry->skb, - entry->info->outdev == NULL))) { + if (entskb->ip_summed == 
CHECKSUM_HW && + (*errp = skb_checksum_help(entskb, + outdev == NULL))) { spin_unlock_bh(&queue->lock); return NULL; } if (queue->copy_range == 0 - || queue->copy_range > entry->skb->len) - data_len = entry->skb->len; + || queue->copy_range > entskb->len) + data_len = entskb->len; else data_len = queue->copy_range; @@ -402,29 +408,30 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, sizeof(struct nfgenmsg)); nfmsg = NLMSG_DATA(nlh); - nfmsg->nfgen_family = entry->info->pf; + nfmsg->nfgen_family = entinf->pf; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(queue->queue_num); pmsg.packet_id = htonl(entry->id); - pmsg.hw_protocol = htons(entry->skb->protocol); - pmsg.hook = entry->info->hook; + pmsg.hw_protocol = htons(entskb->protocol); + pmsg.hook = entinf->hook; NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); - if (entry->info->indev) { - tmp_uint = htonl(entry->info->indev->ifindex); + indev = entinf->indev; + if (indev) { + tmp_uint = htonl(indev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); #else - if (entry->info->pf == PF_BRIDGE) { + if (entinf->pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), &tmp_uint); /* this is the bridge group "brX" */ - tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex); + tmp_uint = htonl(indev->br_port->br->dev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); } else { @@ -432,9 +439,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, * physical device (when called from ipv4) */ NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); - if (entry->skb->nf_bridge - && entry->skb->nf_bridge->physindev) { - tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex); + if (entskb->nf_bridge + && entskb->nf_bridge->physindev) 
{ + tmp_uint = htonl(entskb->nf_bridge->physindev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), &tmp_uint); } @@ -442,19 +449,19 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, #endif } - if (entry->info->outdev) { - tmp_uint = htonl(entry->info->outdev->ifindex); + if (outdev) { + tmp_uint = htonl(outdev->ifindex); #ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); #else - if (entry->info->pf == PF_BRIDGE) { + if (entinf->pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), &tmp_uint); /* this is the bridge group "brX" */ - tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex); + tmp_uint = htonl(outdev->br_port->br->dev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); } else { @@ -462,9 +469,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, * physical output device (when called from ipv4) */ NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); - if (entry->skb->nf_bridge - && entry->skb->nf_bridge->physoutdev) { - tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex); + if (entskb->nf_bridge + && entskb->nf_bridge->physoutdev) { + tmp_uint = htonl(entskb->nf_bridge->physoutdev->ifindex); NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), &tmp_uint); } @@ -472,27 +479,27 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, #endif } - if (entry->skb->nfmark) { - tmp_uint = htonl(entry->skb->nfmark); + if (entskb->nfmark) { + tmp_uint = htonl(entskb->nfmark); NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); } - if (entry->info->indev && entry->skb->dev - && entry->skb->dev->hard_header_parse) { + if (indev && entskb->dev + && entskb->dev->hard_header_parse) { struct nfqnl_msg_packet_hw phw; phw.hw_addrlen = - entry->skb->dev->hard_header_parse(entry->skb, + 
entskb->dev->hard_header_parse(entskb, phw.hw_addr); phw.hw_addrlen = htons(phw.hw_addrlen); NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); } - if (entry->skb->tstamp.off_sec) { + if (entskb->tstamp.off_sec) { struct nfqnl_msg_packet_timestamp ts; - ts.sec = cpu_to_be64(entry->skb->tstamp.off_sec); - ts.usec = cpu_to_be64(entry->skb->tstamp.off_usec); + ts.sec = cpu_to_be64(entskb->tstamp.off_sec); + ts.usec = cpu_to_be64(entskb->tstamp.off_usec); NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); } @@ -510,7 +517,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, nfa->nfa_type = NFQA_PAYLOAD; nfa->nfa_len = size; - if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len)) + if (skb_copy_bits(entskb, 0, NFA_DATA(nfa), data_len)) BUG(); } @@ -667,12 +674,14 @@ nfqnl_set_mode(struct nfqnl_instance *queue, static int dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) { - if (entry->info->indev) - if (entry->info->indev->ifindex == ifindex) + struct nf_info *entinf = entry->info; + + if (entinf->indev) + if (entinf->indev->ifindex == ifindex) return 1; - if (entry->info->outdev) - if (entry->info->outdev->ifindex == ifindex) + if (entinf->outdev) + if (entinf->outdev->ifindex == ifindex) return 1; return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c new file mode 100644 index 000000000000..d7817afc6b96 --- /dev/null +++ b/net/netfilter/x_tables.c @@ -0,0 +1,624 @@ +/* + * x_tables core - Backend for {ip,ip6,arp}_tables + * + * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * + * Based on existing ip_tables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/vmalloc.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_arp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module"); + +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +struct xt_af { + struct semaphore mutex; + struct list_head match; + struct list_head target; + struct list_head tables; +}; + +static struct xt_af *xt; + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +enum { + TABLE, + TARGET, + MATCH, +}; + +/* Registration hooks for targets. */ +int +xt_register_target(int af, struct xt_target *target) +{ + int ret; + + ret = down_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + list_add(&target->list, &xt[af].target); + up(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL(xt_register_target); + +void +xt_unregister_target(int af, struct xt_target *target) +{ + down(&xt[af].mutex); + LIST_DELETE(&xt[af].target, target); + up(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_target); + +int +xt_register_match(int af, struct xt_match *match) +{ + int ret; + + ret = down_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &xt[af].match); + up(&xt[af].mutex); + + return ret; +} +EXPORT_SYMBOL(xt_register_match); + +void +xt_unregister_match(int af, struct xt_match *match) +{ + down(&xt[af].mutex); + LIST_DELETE(&xt[af].match, match); + up(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_match); + + +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use try_then_request_module(). 
+ */ + +/* Find match, grabs ref. Returns ERR_PTR() on error. */ +struct xt_match *xt_find_match(int af, const char *name, u8 revision) +{ + struct xt_match *m; + int err = 0; + + if (down_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + up(&xt[af].mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + up(&xt[af].mutex); + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_match); + +/* Find target, grabs ref. Returns ERR_PTR() on error. */ +struct xt_target *xt_find_target(int af, const char *name, u8 revision) +{ + struct xt_target *t; + int err = 0; + + if (down_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + up(&xt[af].mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. 
*/ + } + } + up(&xt[af].mutex); + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_target); + +static const char *xt_prefix[NPROTO] = { + [AF_INET] = "ipt_%s", + [AF_INET6] = "ip6t_%s", + [NF_ARP] = "arpt_%s", +}; + +struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) +{ + struct xt_target *target; + + target = try_then_request_module(xt_find_target(af, name, revision), + xt_prefix[af], name); + if (IS_ERR(target) || !target) + return NULL; + return target; +} +EXPORT_SYMBOL_GPL(xt_request_find_target); + +static int match_revfn(int af, const char *name, u8 revision, int *bestp) +{ + struct xt_match *m; + int have_rev = 0; + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +static int target_revfn(int af, const char *name, u8 revision, int *bestp) +{ + struct xt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +/* Returns true or false (if no such extension at all) */ +int xt_find_revision(int af, const char *name, u8 revision, int target, + int *err) +{ + int have_rev, best = -1; + + if (down_interruptible(&xt[af].mutex) != 0) { + *err = -EINTR; + return 1; + } + if (target == 1) + have_rev = target_revfn(af, name, revision, &best); + else + have_rev = match_revfn(af, name, revision, &best); + up(&xt[af].mutex); + + /* Nothing at all? Return 0 to try loading module. 
*/ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; +} +EXPORT_SYMBOL_GPL(xt_find_revision); + +struct xt_table_info *xt_alloc_table_info(unsigned int size) +{ + struct xt_table_info *newinfo; + int cpu; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + return NULL; + + newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + xt_free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} +EXPORT_SYMBOL(xt_alloc_table_info); + +void xt_free_table_info(struct xt_table_info *info) +{ + int cpu; + + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} +EXPORT_SYMBOL(xt_free_table_info); + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ +struct xt_table *xt_find_table_lock(int af, const char *name) +{ + struct xt_table *t; + + if (down_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &xt[af].tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_find_table_lock); + +void xt_table_unlock(struct xt_table *table) +{ + up(&xt[table->af].mutex); +} +EXPORT_SYMBOL_GPL(xt_table_unlock); + + +struct xt_table_info * +xt_replace_table(struct xt_table *table, + unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *oldinfo, *private; + + /* Do the substitution. 
*/ + write_lock_bh(&table->lock); + private = table->private; + /* Check inside lock: is the old number correct? */ + if (num_counters != private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} +EXPORT_SYMBOL_GPL(xt_replace_table); + +int xt_register_table(struct xt_table *table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) +{ + int ret; + struct xt_table_info *private; + + ret = down_interruptible(&xt[table->af].mutex); + if (ret != 0) + return ret; + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&xt[table->af].tables, table->name)) { + ret = -EEXIST; + goto unlock; + } + + /* Simplifies replace_table code. */ + table->private = bootstrap; + if (!xt_replace_table(table, 0, newinfo, &ret)) + goto unlock; + + private = table->private; + duprintf("table->private->number = %u\n", private->number); + + /* save number of initial entries */ + private->initial_entries = private->number; + + rwlock_init(&table->lock); + list_prepend(&xt[table->af].tables, table); + + ret = 0; + unlock: + up(&xt[table->af].mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xt_register_table); + +void *xt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + + down(&xt[table->af].mutex); + private = table->private; + LIST_DELETE(&xt[table->af].tables, table); + up(&xt[table->af].mutex); + + return private; +} +EXPORT_SYMBOL_GPL(xt_unregister_table); + +#ifdef CONFIG_PROC_FS +static char *xt_proto_prefix[NPROTO] = { + [AF_INET] = "ip", + [AF_INET6] = "ip6", + [NF_ARP] = "arp", +}; + +static struct list_head *xt_get_idx(struct list_head *list, struct seq_file *seq, loff_t pos) +{ + struct list_head *head = list->next; + + if (!head || 
list_empty(list)) + return NULL; + + while (pos && (head = head->next)) { + if (head == list) + return NULL; + pos--; + } + return pos ? NULL : head; +} + +static struct list_head *type2list(u_int16_t af, u_int16_t type) +{ + struct list_head *list; + + switch (type) { + case TARGET: + list = &xt[af].target; + break; + case MATCH: + list = &xt[af].match; + break; + case TABLE: + list = &xt[af].tables; + break; + default: + list = NULL; + break; + } + + return list; +} + +static void *xt_tgt_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct proc_dir_entry *pde = (struct proc_dir_entry *) seq->private; + u_int16_t af = (unsigned long)pde->data & 0xffff; + u_int16_t type = (unsigned long)pde->data >> 16; + struct list_head *list; + + if (af >= NPROTO) + return NULL; + + list = type2list(af, type); + if (!list) + return NULL; + + if (down_interruptible(&xt[af].mutex) != 0) + return NULL; + + return xt_get_idx(list, seq, *pos); +} + +static void *xt_tgt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = seq->private; + u_int16_t af = (unsigned long)pde->data & 0xffff; + u_int16_t type = (unsigned long)pde->data >> 16; + struct list_head *list; + + if (af >= NPROTO) + return NULL; + + list = type2list(af, type); + if (!list) + return NULL; + + (*pos)++; + return xt_get_idx(list, seq, *pos); +} + +static void xt_tgt_seq_stop(struct seq_file *seq, void *v) +{ + struct proc_dir_entry *pde = seq->private; + u_int16_t af = (unsigned long)pde->data & 0xffff; + + up(&xt[af].mutex); +} + +static int xt_name_seq_show(struct seq_file *seq, void *v) +{ + char *name = (char *)v + sizeof(struct list_head); + + if (strlen(name)) + return seq_printf(seq, "%s\n", name); + else + return 0; +} + +static struct seq_operations xt_tgt_seq_ops = { + .start = xt_tgt_seq_start, + .next = xt_tgt_seq_next, + .stop = xt_tgt_seq_stop, + .show = xt_name_seq_show, +}; + +static int xt_tgt_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = 
seq_open(file, &xt_tgt_seq_ops); + if (!ret) { + struct seq_file *seq = file->private_data; + struct proc_dir_entry *pde = PDE(inode); + + seq->private = pde; + } + + return ret; +} + +static struct file_operations xt_file_ops = { + .owner = THIS_MODULE, + .open = xt_tgt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#define FORMAT_TABLES "_tables_names" +#define FORMAT_MATCHES "_tables_matches" +#define FORMAT_TARGETS "_tables_targets" + +#endif /* CONFIG_PROC_FS */ + +int xt_proto_init(int af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + struct proc_dir_entry *proc; +#endif + + if (af >= NPROTO) + return -EINVAL; + + +#ifdef CONFIG_PROC_FS + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + if (!proc) + goto out; + proc->data = (void *) ((unsigned long) af | (TABLE << 16)); + + + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + if (!proc) + goto out_remove_tables; + proc->data = (void *) ((unsigned long) af | (MATCH << 16)); + + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc = proc_net_fops_create(buf, 0440, &xt_file_ops); + if (!proc) + goto out_remove_matches; + proc->data = (void *) ((unsigned long) af | (TARGET << 16)); +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +out_remove_matches: + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(buf); + +out_remove_tables: + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(buf); +out: + return -1; +#endif +} +EXPORT_SYMBOL_GPL(xt_proto_init); + +void xt_proto_fini(int af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + 
strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(buf); + + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc_net_remove(buf); + + strlcpy(buf, xt_proto_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(buf); +#endif /*CONFIG_PROC_FS*/ +} +EXPORT_SYMBOL_GPL(xt_proto_fini); + + +static int __init xt_init(void) +{ + int i; + + xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL); + if (!xt) + return -ENOMEM; + + for (i = 0; i < NPROTO; i++) { + init_MUTEX(&xt[i].mutex); + INIT_LIST_HEAD(&xt[i].target); + INIT_LIST_HEAD(&xt[i].match); + INIT_LIST_HEAD(&xt[i].tables); + } + return 0; +} + +static void __exit xt_fini(void) +{ + kfree(xt); +} + +module_init(xt_init); +module_exit(xt_fini); + diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index dab78d8bd494..78ee266a12ee 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -15,12 +15,13 @@ #include <linux/ip.h> #include <net/checksum.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_CLASSIFY.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CLASSIFY.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("iptables qdisc classification target module"); +MODULE_ALIAS("ipt_CLASSIFY"); static unsigned int target(struct sk_buff **pskb, @@ -30,25 +31,25 @@ target(struct sk_buff **pskb, const void *targinfo, void *userinfo) { - const struct ipt_classify_target_info *clinfo = targinfo; + const struct xt_classify_target_info *clinfo = targinfo; - if((*pskb)->priority != clinfo->priority) + if ((*pskb)->priority != clinfo->priority) (*pskb)->priority = clinfo->priority; - return IPT_CONTINUE; + return XT_CONTINUE; } static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *e, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { - 
if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){ + if (targinfosize != XT_ALIGN(sizeof(struct xt_classify_target_info))){ printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n", targinfosize, - IPT_ALIGN(sizeof(struct ipt_classify_target_info))); + XT_ALIGN(sizeof(struct xt_classify_target_info))); return 0; } @@ -69,21 +70,39 @@ checkentry(const char *tablename, return 1; } -static struct ipt_target ipt_classify_reg = { +static struct xt_target classify_reg = { + .name = "CLASSIFY", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; +static struct xt_target classify6_reg = { .name = "CLASSIFY", .target = target, .checkentry = checkentry, .me = THIS_MODULE, }; + static int __init init(void) { - return ipt_register_target(&ipt_classify_reg); + int ret; + + ret = xt_register_target(AF_INET, &classify_reg); + if (ret) + return ret; + + ret = xt_register_target(AF_INET6, &classify6_reg); + if (ret) + xt_unregister_target(AF_INET, &classify_reg); + + return ret; } static void __exit fini(void) { - ipt_unregister_target(&ipt_classify_reg); + xt_unregister_target(AF_INET, &classify_reg); + xt_unregister_target(AF_INET6, &classify6_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 8acac5a40a92..22506e376be5 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -26,9 +26,10 @@ MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); MODULE_DESCRIPTION("IP tables CONNMARK matching module"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_CONNMARK"); -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_CONNMARK.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CONNMARK.h> #include <net/netfilter/nf_conntrack_compat.h> static unsigned int @@ -39,7 +40,7 @@ target(struct sk_buff **pskb, const void *targinfo, void *userinfo) { - const struct ipt_connmark_target_info *markinfo = targinfo; + const struct 
xt_connmark_target_info *markinfo = targinfo; u_int32_t diff; u_int32_t nfmark; u_int32_t newmark; @@ -48,17 +49,17 @@ target(struct sk_buff **pskb, if (ctmark) { switch(markinfo->mode) { - case IPT_CONNMARK_SET: + case XT_CONNMARK_SET: newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; if (newmark != *ctmark) *ctmark = newmark; break; - case IPT_CONNMARK_SAVE: + case XT_CONNMARK_SAVE: newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); if (*ctmark != newmark) *ctmark = newmark; break; - case IPT_CONNMARK_RESTORE: + case XT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; diff = (*ctmark ^ nfmark) & markinfo->mask; if (diff != 0) @@ -67,25 +68,25 @@ target(struct sk_buff **pskb, } } - return IPT_CONTINUE; + return XT_CONTINUE; } static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { - struct ipt_connmark_target_info *matchinfo = targinfo; - if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) { + struct xt_connmark_target_info *matchinfo = targinfo; + if (targinfosize != XT_ALIGN(sizeof(struct xt_connmark_target_info))) { printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n", targinfosize, - IPT_ALIGN(sizeof(struct ipt_connmark_target_info))); + XT_ALIGN(sizeof(struct xt_connmark_target_info))); return 0; } - if (matchinfo->mode == IPT_CONNMARK_RESTORE) { + if (matchinfo->mode == XT_CONNMARK_RESTORE) { if (strcmp(tablename, "mangle") != 0) { printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename); return 0; @@ -100,7 +101,13 @@ checkentry(const char *tablename, return 1; } -static struct ipt_target ipt_connmark_reg = { +static struct xt_target connmark_reg = { + .name = "CONNMARK", + .target = &target, + .checkentry = &checkentry, + .me = THIS_MODULE +}; +static struct xt_target connmark6_reg = { .name = "CONNMARK", .target = &target, .checkentry = &checkentry, 
@@ -109,13 +116,25 @@ static struct ipt_target ipt_connmark_reg = { static int __init init(void) { - need_ip_conntrack(); - return ipt_register_target(&ipt_connmark_reg); + int ret; + + need_conntrack(); + + ret = xt_register_target(AF_INET, &connmark_reg); + if (ret) + return ret; + + ret = xt_register_target(AF_INET6, &connmark6_reg); + if (ret) + xt_unregister_target(AF_INET, &connmark_reg); + + return ret; } static void __exit fini(void) { - ipt_unregister_target(&ipt_connmark_reg); + xt_unregister_target(AF_INET, &connmark_reg); + xt_unregister_target(AF_INET6, &connmark6_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/netfilter/xt_MARK.c index 52b4f2c296bf..0c11ee9550f3 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -12,12 +12,14 @@ #include <linux/ip.h> #include <net/checksum.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_MARK.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_MARK.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables MARK modification module"); +MODULE_DESCRIPTION("ip[6]tables MARK modification module"); +MODULE_ALIAS("ipt_MARK"); +MODULE_ALIAS("ip6t_MARK"); static unsigned int target_v0(struct sk_buff **pskb, @@ -27,12 +29,12 @@ target_v0(struct sk_buff **pskb, const void *targinfo, void *userinfo) { - const struct ipt_mark_target_info *markinfo = targinfo; + const struct xt_mark_target_info *markinfo = targinfo; if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - return IPT_CONTINUE; + return XT_CONTINUE; } static unsigned int @@ -43,19 +45,19 @@ target_v1(struct sk_buff **pskb, const void *targinfo, void *userinfo) { - const struct ipt_mark_target_info_v1 *markinfo = targinfo; + const struct xt_mark_target_info_v1 *markinfo = targinfo; int mark = 0; switch (markinfo->mode) { - case IPT_MARK_SET: + case XT_MARK_SET: mark = markinfo->mark; break; - case 
IPT_MARK_AND: + case XT_MARK_AND: mark = (*pskb)->nfmark & markinfo->mark; break; - case IPT_MARK_OR: + case XT_MARK_OR: mark = (*pskb)->nfmark | markinfo->mark; break; } @@ -63,23 +65,23 @@ target_v1(struct sk_buff **pskb, if((*pskb)->nfmark != mark) (*pskb)->nfmark = mark; - return IPT_CONTINUE; + return XT_CONTINUE; } static int checkentry_v0(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { - struct ipt_mark_target_info *markinfo = targinfo; + struct xt_mark_target_info *markinfo = targinfo; - if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { + if (targinfosize != XT_ALIGN(sizeof(struct xt_mark_target_info))) { printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, - IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + XT_ALIGN(sizeof(struct xt_mark_target_info))); return 0; } @@ -98,17 +100,17 @@ checkentry_v0(const char *tablename, static int checkentry_v1(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) { - struct ipt_mark_target_info_v1 *markinfo = targinfo; + struct xt_mark_target_info_v1 *markinfo = targinfo; - if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){ + if (targinfosize != XT_ALIGN(sizeof(struct xt_mark_target_info_v1))){ printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, - IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))); + XT_ALIGN(sizeof(struct xt_mark_target_info_v1))); return 0; } @@ -117,9 +119,9 @@ checkentry_v1(const char *tablename, return 0; } - if (markinfo->mode != IPT_MARK_SET - && markinfo->mode != IPT_MARK_AND - && markinfo->mode != IPT_MARK_OR) { + if (markinfo->mode != XT_MARK_SET + && markinfo->mode != XT_MARK_AND + && markinfo->mode != XT_MARK_OR) { printk(KERN_WARNING "MARK: unknown mode %u\n", markinfo->mode); return 0; @@ -133,7 +135,7 @@ checkentry_v1(const char *tablename, 
return 1; } -static struct ipt_target ipt_mark_reg_v0 = { +static struct xt_target ipt_mark_reg_v0 = { .name = "MARK", .target = target_v0, .checkentry = checkentry_v0, @@ -141,7 +143,7 @@ static struct ipt_target ipt_mark_reg_v0 = { .revision = 0, }; -static struct ipt_target ipt_mark_reg_v1 = { +static struct xt_target ipt_mark_reg_v1 = { .name = "MARK", .target = target_v1, .checkentry = checkentry_v1, @@ -149,23 +151,40 @@ static struct ipt_target ipt_mark_reg_v1 = { .revision = 1, }; +static struct xt_target ip6t_mark_reg_v0 = { + .name = "MARK", + .target = target_v0, + .checkentry = checkentry_v0, + .me = THIS_MODULE, + .revision = 0, +}; + static int __init init(void) { int err; - err = ipt_register_target(&ipt_mark_reg_v0); - if (!err) { - err = ipt_register_target(&ipt_mark_reg_v1); - if (err) - ipt_unregister_target(&ipt_mark_reg_v0); + err = xt_register_target(AF_INET, &ipt_mark_reg_v0); + if (err) + return err; + + err = xt_register_target(AF_INET, &ipt_mark_reg_v1); + if (err) + xt_unregister_target(AF_INET, &ipt_mark_reg_v0); + + err = xt_register_target(AF_INET6, &ip6t_mark_reg_v0); + if (err) { + xt_unregister_target(AF_INET, &ipt_mark_reg_v0); + xt_unregister_target(AF_INET, &ipt_mark_reg_v1); } + return err; } static void __exit fini(void) { - ipt_unregister_target(&ipt_mark_reg_v0); - ipt_unregister_target(&ipt_mark_reg_v1); + xt_unregister_target(AF_INET, &ipt_mark_reg_v0); + xt_unregister_target(AF_INET, &ipt_mark_reg_v1); + xt_unregister_target(AF_INET6, &ip6t_mark_reg_v0); } module_init(init); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c new file mode 100644 index 000000000000..8b76b6f8d1e4 --- /dev/null +++ b/net/netfilter/xt_NFQUEUE.c @@ -0,0 +1,107 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * 
published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_NFQUEUE.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("[ip,ip6,arp]_tables NFQUEUE target"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NFQUEUE"); +MODULE_ALIAS("ip6t_NFQUEUE"); +MODULE_ALIAS("arpt_NFQUEUE"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct xt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const void *entry, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != XT_ALIGN(sizeof(struct xt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + XT_ALIGN(sizeof(struct xt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct xt_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static struct xt_target ip6t_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static struct xt_target arpt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + ret = xt_register_target(AF_INET, &ipt_NFQ_reg); + if (ret) + return ret; + ret = xt_register_target(AF_INET6, &ip6t_NFQ_reg); + if (ret) + goto out_ip; + ret = xt_register_target(NF_ARP, &arpt_NFQ_reg); + if (ret) + goto out_ip6; + + return ret; +out_ip6: + xt_unregister_target(AF_INET6, &ip6t_NFQ_reg); +out_ip: + xt_unregister_target(AF_INET, &ipt_NFQ_reg); + + return ret; +} + +static void __exit fini(void) +{ + 
xt_unregister_target(NF_ARP, &arpt_NFQ_reg); + xt_unregister_target(AF_INET6, &ip6t_NFQ_reg); + xt_unregister_target(AF_INET, &ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index e3c69d072c6e..24d477afa939 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -4,9 +4,12 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack_compat.h> +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NOTRACK"); + static unsigned int target(struct sk_buff **pskb, const struct net_device *in, @@ -17,7 +20,7 @@ target(struct sk_buff **pskb, { /* Previously seen (loopback)? Ignore. */ if ((*pskb)->nfct != NULL) - return IPT_CONTINUE; + return XT_CONTINUE; /* Attach fake conntrack entry. If there is a real ct entry correspondig to this packet, @@ -27,12 +30,12 @@ target(struct sk_buff **pskb, (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); - return IPT_CONTINUE; + return XT_CONTINUE; } static int checkentry(const char *tablename, - const struct ipt_entry *e, + const void *entry, void *targinfo, unsigned int targinfosize, unsigned int hook_mask) @@ -51,26 +54,39 @@ checkentry(const char *tablename, return 1; } -static struct ipt_target ipt_notrack_reg = { +static struct xt_target notrack_reg = { .name = "NOTRACK", .target = target, .checkentry = checkentry, - .me = THIS_MODULE + .me = THIS_MODULE, +}; +static struct xt_target notrack6_reg = { + .name = "NOTRACK", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, }; static int __init init(void) { - if (ipt_register_target(&ipt_notrack_reg)) - return -EINVAL; + int ret; + + ret = xt_register_target(AF_INET, &notrack_reg); + if (ret) + return ret; - return 0; + ret = xt_register_target(AF_INET6, &notrack6_reg); + if (ret) + xt_unregister_target(AF_INET, &notrack_reg); + + return ret; }
static void __exit fini(void) { - ipt_unregister_target(&ipt_notrack_reg); + xt_unregister_target(AF_INET6, &notrack6_reg); + xt_unregister_target(AF_INET, &notrack_reg); } module_init(init); module_exit(fini); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/netfilter/xt_comment.c index 6b76a1ea5245..4ba6fd65c6e9 100644 --- a/net/ipv4/netfilter/ipt_comment.c +++ b/net/netfilter/xt_comment.c @@ -6,12 +6,14 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_comment.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_comment.h> MODULE_AUTHOR("Brad Fisher <brad@info-link.net>"); MODULE_DESCRIPTION("iptables comment match module"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_comment"); +MODULE_ALIAS("ip6t_comment"); static int match(const struct sk_buff *skb, @@ -19,6 +21,7 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protooff, int *hotdrop) { /* We always match */ @@ -27,18 +30,25 @@ match(const struct sk_buff *skb, static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { /* Check the size */ - if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_comment_info))) return 0; return 1; } -static struct ipt_match comment_match = { +static struct xt_match comment_match = { + .name = "comment", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static struct xt_match comment6_match = { .name = "comment", .match = match, .checkentry = checkentry, @@ -47,12 +57,23 @@ static struct ipt_match comment_match = { static int __init init(void) { - return ipt_register_match(&comment_match); + int ret; + + ret = xt_register_match(AF_INET, &comment_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, 
&comment6_match); + if (ret) + xt_unregister_match(AF_INET, &comment_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&comment_match); + xt_unregister_match(AF_INET, &comment_match); + xt_unregister_match(AF_INET6, &comment6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/netfilter/xt_connbytes.c index d68a048b7176..150d2a4b0f71 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -6,13 +6,15 @@ * - add functionality to match number of packets * - add functionality to match average packet size * - add support to match directions seperately + * 2005-10-16 Harald Welte <laforge@netfilter.org> + * - Port to x_tables * */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_compat.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_connbytes.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connbytes.h> #include <asm/div64.h> #include <asm/bitops.h> @@ -20,6 +22,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); +MODULE_ALIAS("ipt_connbytes"); /* 64bit divisor, dividend and result. 
dynamic precision */ static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) @@ -43,9 +46,10 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = matchinfo; u_int64_t what = 0; /* initialize to make gcc happy */ const struct ip_conntrack_counter *counters; @@ -53,45 +57,45 @@ match(const struct sk_buff *skb, return 0; /* no match */ switch (sinfo->what) { - case IPT_CONNBYTES_PKTS: + case XT_CONNBYTES_PKTS: switch (sinfo->direction) { - case IPT_CONNBYTES_DIR_ORIGINAL: + case XT_CONNBYTES_DIR_ORIGINAL: what = counters[IP_CT_DIR_ORIGINAL].packets; break; - case IPT_CONNBYTES_DIR_REPLY: + case XT_CONNBYTES_DIR_REPLY: what = counters[IP_CT_DIR_REPLY].packets; break; - case IPT_CONNBYTES_DIR_BOTH: + case XT_CONNBYTES_DIR_BOTH: what = counters[IP_CT_DIR_ORIGINAL].packets; what += counters[IP_CT_DIR_REPLY].packets; break; } break; - case IPT_CONNBYTES_BYTES: + case XT_CONNBYTES_BYTES: switch (sinfo->direction) { - case IPT_CONNBYTES_DIR_ORIGINAL: + case XT_CONNBYTES_DIR_ORIGINAL: what = counters[IP_CT_DIR_ORIGINAL].bytes; break; - case IPT_CONNBYTES_DIR_REPLY: + case XT_CONNBYTES_DIR_REPLY: what = counters[IP_CT_DIR_REPLY].bytes; break; - case IPT_CONNBYTES_DIR_BOTH: + case XT_CONNBYTES_DIR_BOTH: what = counters[IP_CT_DIR_ORIGINAL].bytes; what += counters[IP_CT_DIR_REPLY].bytes; break; } break; - case IPT_CONNBYTES_AVGPKT: + case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { - case IPT_CONNBYTES_DIR_ORIGINAL: + case XT_CONNBYTES_DIR_ORIGINAL: what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, counters[IP_CT_DIR_ORIGINAL].packets); break; - case IPT_CONNBYTES_DIR_REPLY: + case XT_CONNBYTES_DIR_REPLY: what = div64_64(counters[IP_CT_DIR_REPLY].bytes, counters[IP_CT_DIR_REPLY].packets); break; - case IPT_CONNBYTES_DIR_BOTH: + case XT_CONNBYTES_DIR_BOTH: { u_int64_t bytes; u_int64_t 
pkts; @@ -117,30 +121,36 @@ match(const struct sk_buff *skb, } static int check(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - const struct ipt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = matchinfo; - if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_connbytes_info))) return 0; - if (sinfo->what != IPT_CONNBYTES_PKTS && - sinfo->what != IPT_CONNBYTES_BYTES && - sinfo->what != IPT_CONNBYTES_AVGPKT) + if (sinfo->what != XT_CONNBYTES_PKTS && + sinfo->what != XT_CONNBYTES_BYTES && + sinfo->what != XT_CONNBYTES_AVGPKT) return 0; - if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && - sinfo->direction != IPT_CONNBYTES_DIR_REPLY && - sinfo->direction != IPT_CONNBYTES_DIR_BOTH) + if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != XT_CONNBYTES_DIR_REPLY && + sinfo->direction != XT_CONNBYTES_DIR_BOTH) return 0; return 1; } -static struct ipt_match state_match = { +static struct xt_match connbytes_match = { + .name = "connbytes", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE +}; +static struct xt_match connbytes6_match = { .name = "connbytes", .match = &match, .checkentry = &check, @@ -149,12 +159,21 @@ static struct ipt_match state_match = { static int __init init(void) { - return ipt_register_match(&state_match); + int ret; + ret = xt_register_match(AF_INET, &connbytes_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &connbytes6_match); + if (ret) + xt_unregister_match(AF_INET, &connbytes_match); + return ret; } static void __exit fini(void) { - ipt_unregister_match(&state_match); + xt_unregister_match(AF_INET, &connbytes_match); + xt_unregister_match(AF_INET6, &connbytes6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/netfilter/xt_connmark.c index 5306ef293b92..d06e925032da 100644 --- 
a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -25,9 +25,10 @@ MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); MODULE_DESCRIPTION("IP tables connmark match module"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_connmark"); -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_connmark.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connmark.h> #include <net/netfilter/nf_conntrack_compat.h> static int @@ -36,9 +37,10 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_connmark_info *info = matchinfo; + const struct xt_connmark_info *info = matchinfo; u_int32_t ctinfo; const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); if (!ctmark) @@ -49,14 +51,14 @@ match(const struct sk_buff *skb, static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - struct ipt_connmark_info *cm = - (struct ipt_connmark_info *)matchinfo; - if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) + struct xt_connmark_info *cm = + (struct xt_connmark_info *)matchinfo; + if (matchsize != XT_ALIGN(sizeof(struct xt_connmark_info))) return 0; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { @@ -67,21 +69,40 @@ checkentry(const char *tablename, return 1; } -static struct ipt_match connmark_match = { +static struct xt_match connmark_match = { + .name = "connmark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE +}; +static struct xt_match connmark6_match = { .name = "connmark", .match = &match, .checkentry = &checkentry, .me = THIS_MODULE }; + static int __init init(void) { - return ipt_register_match(&connmark_match); + int ret; + + need_conntrack(); + + ret = xt_register_match(AF_INET, &connmark_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, 
&connmark6_match); + if (ret) + xt_unregister_match(AF_INET, &connmark_match); + return ret; } static void __exit fini(void) { - ipt_unregister_match(&connmark_match); + xt_unregister_match(AF_INET6, &connmark6_match); + xt_unregister_match(AF_INET, &connmark_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/netfilter/xt_conntrack.c index c8d18705469b..ffdebc95eb95 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -18,12 +18,13 @@ #include <net/netfilter/nf_conntrack.h> #endif -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_conntrack.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables connection tracking match module"); +MODULE_ALIAS("ipt_conntrack"); #if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) @@ -33,9 +34,10 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_conntrack_info *sinfo = matchinfo; + const struct xt_conntrack_info *sinfo = matchinfo; struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; unsigned int statebit; @@ -45,58 +47,58 @@ match(const struct sk_buff *skb, #define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) if (ct == &ip_conntrack_untracked) - statebit = IPT_CONNTRACK_STATE_UNTRACKED; + statebit = XT_CONNTRACK_STATE_UNTRACKED; else if (ct) - statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); else - statebit = IPT_CONNTRACK_STATE_INVALID; + statebit = XT_CONNTRACK_STATE_INVALID; - if(sinfo->flags & IPT_CONNTRACK_STATE) { + if(sinfo->flags & XT_CONNTRACK_STATE) { if (ct) { if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip != ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip) - statebit |= IPT_CONNTRACK_STATE_SNAT; + statebit |= 
XT_CONNTRACK_STATE_SNAT; if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip != ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip) - statebit |= IPT_CONNTRACK_STATE_DNAT; + statebit |= XT_CONNTRACK_STATE_DNAT; } - if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + if (FWINV((statebit & sinfo->statemask) == 0, XT_CONNTRACK_STATE)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_PROTO) { - if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + if(sinfo->flags & XT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, XT_CONNTRACK_PROTO)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + if(sinfo->flags & XT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, XT_CONNTRACK_ORIGSRC)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + if(sinfo->flags & XT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, XT_CONNTRACK_ORIGDST)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + if(sinfo->flags & XT_CONNTRACK_REPLSRC) { + if (!ct || 
FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, XT_CONNTRACK_REPLSRC)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_REPLDST) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + if(sinfo->flags & XT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, XT_CONNTRACK_REPLDST)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_STATUS) { - if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + if(sinfo->flags & XT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, XT_CONNTRACK_STATUS)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + if(sinfo->flags & XT_CONNTRACK_EXPIRES) { unsigned long expires; if(!ct) @@ -104,7 +106,7 @@ match(const struct sk_buff *skb, expires = timer_pending(&ct->timeout) ? 
(ct->timeout.expires - jiffies)/HZ : 0; - if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), XT_CONNTRACK_EXPIRES)) return 0; } @@ -118,9 +120,10 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_conntrack_info *sinfo = matchinfo; + const struct xt_conntrack_info *sinfo = matchinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int statebit; @@ -130,58 +133,58 @@ match(const struct sk_buff *skb, #define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) if (ct == &nf_conntrack_untracked) - statebit = IPT_CONNTRACK_STATE_UNTRACKED; + statebit = XT_CONNTRACK_STATE_UNTRACKED; else if (ct) - statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); else - statebit = IPT_CONNTRACK_STATE_INVALID; + statebit = XT_CONNTRACK_STATE_INVALID; - if(sinfo->flags & IPT_CONNTRACK_STATE) { + if(sinfo->flags & XT_CONNTRACK_STATE) { if (ct) { if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) - statebit |= IPT_CONNTRACK_STATE_SNAT; + statebit |= XT_CONNTRACK_STATE_SNAT; if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) - statebit |= IPT_CONNTRACK_STATE_DNAT; + statebit |= XT_CONNTRACK_STATE_DNAT; } - if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + if (FWINV((statebit & sinfo->statemask) == 0, XT_CONNTRACK_STATE)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_PROTO) { - if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + if(sinfo->flags & XT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, 
XT_CONNTRACK_PROTO)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + if(sinfo->flags & XT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, XT_CONNTRACK_ORIGSRC)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + if(sinfo->flags & XT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, XT_CONNTRACK_ORIGDST)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + if(sinfo->flags & XT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, XT_CONNTRACK_REPLSRC)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_REPLDST) { - if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + if(sinfo->flags & XT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, XT_CONNTRACK_REPLDST)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_STATUS) { - if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + if(sinfo->flags & 
XT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, XT_CONNTRACK_STATUS)) return 0; } - if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + if(sinfo->flags & XT_CONNTRACK_EXPIRES) { unsigned long expires; if(!ct) @@ -189,7 +192,7 @@ match(const struct sk_buff *skb, expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; - if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), XT_CONNTRACK_EXPIRES)) return 0; } @@ -199,18 +202,18 @@ match(const struct sk_buff *skb, #endif /* CONFIG_NF_IP_CONNTRACK */ static int check(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_conntrack_info))) return 0; return 1; } -static struct ipt_match conntrack_match = { +static struct xt_match conntrack_match = { .name = "conntrack", .match = &match, .checkentry = &check, @@ -219,13 +222,16 @@ static struct ipt_match conntrack_match = { static int __init init(void) { - need_ip_conntrack(); - return ipt_register_match(&conntrack_match); + int ret; + need_conntrack(); + ret = xt_register_match(AF_INET, &conntrack_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&conntrack_match); + xt_unregister_match(AF_INET, &conntrack_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/netfilter/xt_dccp.c index ad3278bba6c1..779f42fc9524 100644 --- a/net/ipv4/netfilter/ipt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -14,8 +14,16 @@ #include <net/ip.h> #include <linux/dccp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_dccp.h> + #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_dccp.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + 
+MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Match for DCCP protocol packets"); +MODULE_ALIAS("ipt_dccp"); #define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ || (!!((invflag) & (option)) ^ (cond))) @@ -26,6 +34,7 @@ static DEFINE_SPINLOCK(dccp_buflock); static inline int dccp_find_option(u_int8_t option, const struct sk_buff *skb, + unsigned int protoff, const struct dccp_hdr *dh, int *hotdrop) { @@ -44,9 +53,7 @@ dccp_find_option(u_int8_t option, return 0; spin_lock_bh(&dccp_buflock); - op = skb_header_pointer(skb, - skb->nh.iph->ihl*4 + optoff, - optlen, dccp_optbuf); + op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); if (op == NULL) { /* If we don't have the whole header, drop packet. */ spin_unlock_bh(&dccp_buflock); @@ -78,10 +85,10 @@ match_types(const struct dccp_hdr *dh, u_int16_t typemask) } static inline int -match_option(u_int8_t option, const struct sk_buff *skb, +match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, const struct dccp_hdr *dh, int *hotdrop) { - return dccp_find_option(option, skb, dh, hotdrop); + return dccp_find_option(option, skb, protoff, dh, hotdrop); } static int @@ -90,16 +97,17 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_dccp_info *info = - (const struct ipt_dccp_info *)matchinfo; + const struct xt_dccp_info *info = + (const struct xt_dccp_info *)matchinfo; struct dccp_hdr _dh, *dh; if (offset) return 0; - dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh); if (dh == NULL) { *hotdrop = 1; return 0; @@ -107,42 +115,73 @@ match(const struct sk_buff *skb, return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) && (ntohs(dh->dccph_sport) <= info->spts[1])), - IPT_DCCP_SRC_PORTS, info->flags, info->invflags) + 
XT_DCCP_SRC_PORTS, info->flags, info->invflags) && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) && (ntohs(dh->dccph_dport) <= info->dpts[1])), - IPT_DCCP_DEST_PORTS, info->flags, info->invflags) + XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), - IPT_DCCP_TYPE, info->flags, info->invflags) - && DCCHECK(match_option(info->option, skb, dh, hotdrop), - IPT_DCCP_OPTION, info->flags, info->invflags); + XT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, protoff, dh, + hotdrop), + XT_DCCP_OPTION, info->flags, info->invflags); } static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - const struct ipt_dccp_info *info; + const struct ipt_ip *ip = inf; + const struct xt_dccp_info *info; - info = (const struct ipt_dccp_info *)matchinfo; + info = (const struct xt_dccp_info *)matchinfo; return ip->proto == IPPROTO_DCCP - && !(ip->invflags & IPT_INV_PROTO) - && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) - && !(info->flags & ~IPT_DCCP_VALID_FLAGS) - && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) + && !(ip->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_dccp_info)) + && !(info->flags & ~XT_DCCP_VALID_FLAGS) + && !(info->invflags & ~XT_DCCP_VALID_FLAGS) && !(info->invflags & ~info->flags); } -static struct ipt_match dccp_match = +static int +checkentry6(const char *tablename, + const void *inf, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_ip6 *ip = inf; + const struct xt_dccp_info *info; + + info = (const struct xt_dccp_info *)matchinfo; + + return ip->proto == IPPROTO_DCCP + && !(ip->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_dccp_info)) + && !(info->flags & ~XT_DCCP_VALID_FLAGS) + && !(info->invflags & ~XT_DCCP_VALID_FLAGS) + && !(info->invflags & ~info->flags); +} + + +static struct 
xt_match dccp_match = { .name = "dccp", .match = &match, .checkentry = &checkentry, .me = THIS_MODULE, }; +static struct xt_match dccp6_match = +{ + .name = "dccp", + .match = &match, + .checkentry = &checkentry6, + .me = THIS_MODULE, +}; + static int __init init(void) { @@ -154,23 +193,29 @@ static int __init init(void) dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); if (!dccp_optbuf) return -ENOMEM; - ret = ipt_register_match(&dccp_match); + ret = xt_register_match(AF_INET, &dccp_match); if (ret) - kfree(dccp_optbuf); + goto out_kfree; + ret = xt_register_match(AF_INET6, &dccp6_match); + if (ret) + goto out_unreg; + + return ret; + +out_unreg: + xt_unregister_match(AF_INET, &dccp_match); +out_kfree: + kfree(dccp_optbuf); return ret; } static void __exit fini(void) { - ipt_unregister_match(&dccp_match); + xt_unregister_match(AF_INET6, &dccp6_match); + xt_unregister_match(AF_INET, &dccp_match); kfree(dccp_optbuf); } module_init(init); module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Match for DCCP protocol packets"); - diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/netfilter/xt_helper.c index bf14e1c7798a..38b6715e1db4 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/netfilter/xt_helper.c @@ -22,12 +22,14 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #endif -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_helper.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_helper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); MODULE_DESCRIPTION("iptables helper match module"); +MODULE_ALIAS("ipt_helper"); +MODULE_ALIAS("ip6t_helper"); #if 0 #define DEBUGP printk @@ -42,27 +44,28 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_helper_info *info = 
matchinfo; + const struct xt_helper_info *info = matchinfo; struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; int ret = info->invert; ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); if (!ct) { - DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + DEBUGP("xt_helper: Eek! invalid conntrack?\n"); return ret; } if (!ct->master) { - DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + DEBUGP("xt_helper: conntrack %p has no master\n", ct); return ret; } read_lock_bh(&ip_conntrack_lock); if (!ct->master->helper) { - DEBUGP("ipt_helper: master ct %p has no helper\n", + DEBUGP("xt_helper: master ct %p has no helper\n", exp->expectant); goto out_unlock; } @@ -88,27 +91,28 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_helper_info *info = matchinfo; + const struct xt_helper_info *info = matchinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; int ret = info->invert; ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); if (!ct) { - DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + DEBUGP("xt_helper: Eek! 
invalid conntrack?\n"); return ret; } if (!ct->master) { - DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + DEBUGP("xt_helper: conntrack %p has no master\n", ct); return ret; } read_lock_bh(&nf_conntrack_lock); if (!ct->master->helper) { - DEBUGP("ipt_helper: master ct %p has no helper\n", + DEBUGP("xt_helper: master ct %p has no helper\n", exp->expectant); goto out_unlock; } @@ -128,23 +132,29 @@ out_unlock: #endif static int check(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - struct ipt_helper_info *info = matchinfo; + struct xt_helper_info *info = matchinfo; info->name[29] = '\0'; /* verify size */ - if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_helper_info))) return 0; return 1; } -static struct ipt_match helper_match = { +static struct xt_match helper_match = { + .name = "helper", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; +static struct xt_match helper6_match = { .name = "helper", .match = &match, .checkentry = &check, @@ -153,13 +163,24 @@ static struct ipt_match helper_match = { static int __init init(void) { - need_ip_conntrack(); - return ipt_register_match(&helper_match); + int ret; + need_conntrack(); + + ret = xt_register_match(AF_INET, &helper_match); + if (ret < 0) + return ret; + + ret = xt_register_match(AF_INET6, &helper6_match); + if (ret < 0) + xt_unregister_match(AF_INET, &helper_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&helper_match); + xt_unregister_match(AF_INET, &helper_match); + xt_unregister_match(AF_INET6, &helper6_match); } module_init(init); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c new file mode 100644 index 000000000000..39c8faea63de --- /dev/null +++ b/net/netfilter/xt_length.c @@ -0,0 +1,99 @@ +/* Kernel module to match packet length. 
*/ +/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip.h> + +#include <linux/netfilter/xt_length.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_length"); +MODULE_ALIAS("ip6t_length"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct xt_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +match6(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct xt_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const void *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != XT_ALIGN(sizeof(struct xt_length_info))) + return 0; + + return 1; +} + +static struct xt_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; +static struct xt_match length6_match = { + .name = "length", + .match = &match6, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + ret = xt_register_match(AF_INET, 
&length_match); + if (ret) + return ret; + ret = xt_register_match(AF_INET6, &length6_match); + if (ret) + xt_unregister_match(AF_INET, &length_match); + + return ret; +} + +static void __exit fini(void) +{ + xt_unregister_match(AF_INET, &length_match); + xt_unregister_match(AF_INET6, &length6_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/netfilter/xt_limit.c index 0c24dcc703a5..15e40506bc3a 100644 --- a/net/ipv4/netfilter/ipt_limit.c +++ b/net/netfilter/xt_limit.c @@ -18,12 +18,14 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_limit.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_limit.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); MODULE_DESCRIPTION("iptables rate limit match"); +MODULE_ALIAS("ipt_limit"); +MODULE_ALIAS("ip6t_limit"); /* The algorithm used is the Simple Token Bucket Filter (TBF) * see net/sched/sch_tbf.c in the linux source tree @@ -68,9 +70,10 @@ ipt_limit_match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master; + struct xt_rateinfo *r = ((struct xt_rateinfo *)matchinfo)->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); @@ -96,32 +99,32 @@ user2credits(u_int32_t user) /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) /* Divide first. 
*/ - return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; - return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE; + return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; } static int ipt_limit_checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - struct ipt_rateinfo *r = matchinfo; + struct xt_rateinfo *r = matchinfo; - if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo))) + if (matchsize != XT_ALIGN(sizeof(struct xt_rateinfo))) return 0; /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { - printk("Overflow in ipt_limit, try lower: %u/%u\n", + printk("Overflow in xt_limit, try lower: %u/%u\n", r->avg, r->burst); return 0; } - /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies * + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */ r->prev = jiffies; r->credit = user2credits(r->avg * r->burst); /* Credits full. 
*/ @@ -134,7 +137,13 @@ ipt_limit_checkentry(const char *tablename, return 1; } -static struct ipt_match ipt_limit_reg = { +static struct xt_match ipt_limit_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, + .me = THIS_MODULE, +}; +static struct xt_match limit6_reg = { .name = "limit", .match = ipt_limit_match, .checkentry = ipt_limit_checkentry, @@ -143,14 +152,23 @@ static struct ipt_match ipt_limit_reg = { static int __init init(void) { - if (ipt_register_match(&ipt_limit_reg)) - return -EINVAL; - return 0; + int ret; + + ret = xt_register_match(AF_INET, &ipt_limit_reg); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &limit6_reg); + if (ret) + xt_unregister_match(AF_INET, &ipt_limit_reg); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&ipt_limit_reg); + xt_unregister_match(AF_INET, &ipt_limit_reg); + xt_unregister_match(AF_INET6, &limit6_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/netfilter/xt_mac.c index 11a459e33f25..0461dcb5fc7a 100644 --- a/net/ipv4/netfilter/ipt_mac.c +++ b/net/netfilter/xt_mac.c @@ -11,13 +11,17 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_ether.h> +#include <linux/etherdevice.h> -#include <linux/netfilter_ipv4/ipt_mac.h> -#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/xt_mac.h> +#include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mac matching module"); +MODULE_ALIAS("ipt_mac"); +MODULE_ALIAS("ip6t_mac"); static int match(const struct sk_buff *skb, @@ -25,21 +29,22 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_mac_info *info = matchinfo; + const struct xt_mac_info *info = matchinfo; /* Is mac pointer valid? 
*/ return (skb->mac.raw >= skb->head && (skb->mac.raw + ETH_HLEN) <= skb->data /* If so, compare... */ - && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) - == 0) ^ info->invert)); + && ((!compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr)) + ^ info->invert)); } static int ipt_mac_checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) @@ -48,17 +53,23 @@ ipt_mac_checkentry(const char *tablename, if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD))) { - printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + printk("xt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); return 0; } - if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_mac_info))) return 0; return 1; } -static struct ipt_match mac_match = { +static struct xt_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ipt_mac_checkentry, + .me = THIS_MODULE, +}; +static struct xt_match mac6_match = { .name = "mac", .match = &match, .checkentry = &ipt_mac_checkentry, @@ -67,12 +78,22 @@ static struct ipt_match mac_match = { static int __init init(void) { - return ipt_register_match(&mac_match); + int ret; + ret = xt_register_match(AF_INET, &mac_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &mac6_match); + if (ret) + xt_unregister_match(AF_INET, &mac_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&mac_match); + xt_unregister_match(AF_INET, &mac_match); + xt_unregister_match(AF_INET6, &mac6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/netfilter/xt_mark.c index 00bef6cdd3f8..2a0ac62b72c8 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/netfilter/xt_mark.c @@ -10,12 +10,14 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ipt_mark.h> -#include 
<linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/xt_mark.h> +#include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables mark matching module"); +MODULE_ALIAS("ipt_mark"); +MODULE_ALIAS("ip6t_mark"); static int match(const struct sk_buff *skb, @@ -23,23 +25,24 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_mark_info *info = matchinfo; + const struct xt_mark_info *info = matchinfo; return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; } static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *entry, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; + struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; - if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_mark_info))) return 0; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { @@ -50,7 +53,14 @@ checkentry(const char *tablename, return 1; } -static struct ipt_match mark_match = { +static struct xt_match mark_match = { + .name = "mark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static struct xt_match mark6_match = { .name = "mark", .match = &match, .checkentry = &checkentry, @@ -59,12 +69,22 @@ static struct ipt_match mark_match = { static int __init init(void) { - return ipt_register_match(&mark_match); + int ret; + ret = xt_register_match(AF_INET, &mark_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &mark6_match); + if (ret) + xt_unregister_match(AF_INET, &mark_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&mark_match); + xt_unregister_match(AF_INET, &mark_match); + xt_unregister_match(AF_INET6, &mark6_match); } module_init(init); 
diff --git a/net/ipv6/netfilter/ip6t_physdev.c b/net/netfilter/xt_physdev.c index 71515c86ece1..19bb57c14dfe 100644 --- a/net/ipv6/netfilter/ip6t_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -10,8 +10,8 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv6/ip6t_physdev.h> -#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/xt_physdev.h> +#include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge.h> #define MATCH 1 #define NOMATCH 0 @@ -19,6 +19,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("iptables bridge physical device match module"); +MODULE_ALIAS("ipt_physdev"); +MODULE_ALIAS("ip6t_physdev"); static int match(const struct sk_buff *skb, @@ -31,7 +33,7 @@ match(const struct sk_buff *skb, { int i; static const char nulldevname[IFNAMSIZ]; - const struct ip6t_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = matchinfo; unsigned int ret; const char *indev, *outdev; struct nf_bridge_info *nf_bridge; @@ -41,37 +43,37 @@ match(const struct sk_buff *skb, * the destination device will be a bridge. 
*/ if (!(nf_bridge = skb->nf_bridge)) { /* Return MATCH if the invert flags of the used options are on */ - if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && - !(info->invert & IP6T_PHYSDEV_OP_BRIDGED)) + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && + !(info->invert & XT_PHYSDEV_OP_BRIDGED)) return NOMATCH; - if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN) && - !(info->invert & IP6T_PHYSDEV_OP_ISIN)) + if ((info->bitmask & XT_PHYSDEV_OP_ISIN) && + !(info->invert & XT_PHYSDEV_OP_ISIN)) return NOMATCH; - if ((info->bitmask & IP6T_PHYSDEV_OP_ISOUT) && - !(info->invert & IP6T_PHYSDEV_OP_ISOUT)) + if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) && + !(info->invert & XT_PHYSDEV_OP_ISOUT)) return NOMATCH; - if ((info->bitmask & IP6T_PHYSDEV_OP_IN) && - !(info->invert & IP6T_PHYSDEV_OP_IN)) + if ((info->bitmask & XT_PHYSDEV_OP_IN) && + !(info->invert & XT_PHYSDEV_OP_IN)) return NOMATCH; - if ((info->bitmask & IP6T_PHYSDEV_OP_OUT) && - !(info->invert & IP6T_PHYSDEV_OP_OUT)) + if ((info->bitmask & XT_PHYSDEV_OP_OUT) && + !(info->invert & XT_PHYSDEV_OP_OUT)) return NOMATCH; return MATCH; } /* This only makes sense in the FORWARD and POSTROUTING chains */ - if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && (!!(nf_bridge->mask & BRNF_BRIDGED) ^ - !(info->invert & IP6T_PHYSDEV_OP_BRIDGED))) + !(info->invert & XT_PHYSDEV_OP_BRIDGED))) return NOMATCH; - if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN && - (!nf_bridge->physindev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISIN))) || - (info->bitmask & IP6T_PHYSDEV_OP_ISOUT && - (!nf_bridge->physoutdev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISOUT)))) + if ((info->bitmask & XT_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || + (info->bitmask & XT_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) return NOMATCH; - if (!(info->bitmask & IP6T_PHYSDEV_OP_IN)) + if (!(info->bitmask & XT_PHYSDEV_OP_IN)) goto match_outdev; indev = 
nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { @@ -80,11 +82,11 @@ match(const struct sk_buff *skb, & ((const unsigned int *)info->in_mask)[i]; } - if ((ret == 0) ^ !(info->invert & IP6T_PHYSDEV_OP_IN)) + if ((ret == 0) ^ !(info->invert & XT_PHYSDEV_OP_IN)) return NOMATCH; match_outdev: - if (!(info->bitmask & IP6T_PHYSDEV_OP_OUT)) + if (!(info->bitmask & XT_PHYSDEV_OP_OUT)) return MATCH; outdev = nf_bridge->physoutdev ? nf_bridge->physoutdev->name : nulldevname; @@ -94,27 +96,34 @@ match_outdev: & ((const unsigned int *)info->out_mask)[i]; } - return (ret != 0) ^ !(info->invert & IP6T_PHYSDEV_OP_OUT); + return (ret != 0) ^ !(info->invert & XT_PHYSDEV_OP_OUT); } static int checkentry(const char *tablename, - const struct ip6t_ip6 *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - const struct ip6t_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = matchinfo; - if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_physdev_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_physdev_info))) return 0; - if (!(info->bitmask & IP6T_PHYSDEV_OP_MASK) || - info->bitmask & ~IP6T_PHYSDEV_OP_MASK) + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) return 0; return 1; } -static struct ip6t_match physdev_match = { +static struct xt_match physdev_match = { + .name = "physdev", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static struct xt_match physdev6_match = { .name = "physdev", .match = &match, .checkentry = &checkentry, @@ -123,12 +132,23 @@ static struct ip6t_match physdev_match = { static int __init init(void) { - return ip6t_register_match(&physdev_match); + int ret; + + ret = xt_register_match(AF_INET, &physdev_match); + if (ret < 0) + return ret; + + ret = xt_register_match(AF_INET6, &physdev6_match); + if (ret < 0) + xt_unregister_match(AF_INET, &physdev_match); + + 
return ret; } static void __exit fini(void) { - ip6t_unregister_match(&physdev_match); + xt_unregister_match(AF_INET, &physdev_match); + xt_unregister_match(AF_INET6, &physdev6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/netfilter/xt_pkttype.c index 8ddb1dc5e5ae..ab1b2630f97d 100644 --- a/net/ipv4/netfilter/ipt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -10,60 +10,72 @@ #include <linux/if_ether.h> #include <linux/if_packet.h> -#include <linux/netfilter_ipv4/ipt_pkttype.h> -#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/xt_pkttype.h> +#include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); MODULE_DESCRIPTION("IP tables match to match on linklayer packet type"); +MODULE_ALIAS("ipt_pkttype"); +MODULE_ALIAS("ip6t_pkttype"); static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_pkttype_info *info = matchinfo; + const struct xt_pkttype_info *info = matchinfo; - return (skb->pkt_type == info->pkttype) ^ info->invert; + return (skb->pkt_type == info->pkttype) ^ info->invert; } static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { -/* - if (hook_mask - & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) - | (1 << NF_IP_FORWARD))) { - printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); - return 0; - } -*/ - if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_pkttype_info))) return 0; return 1; } -static struct ipt_match pkttype_match = { +static struct xt_match pkttype_match = { .name = "pkttype", .match = &match, .checkentry = &checkentry, .me = THIS_MODULE, }; +static struct xt_match pkttype6_match = { + .name = "pkttype", + .match = 
&match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + static int __init init(void) { - return ipt_register_match(&pkttype_match); + int ret; + ret = xt_register_match(AF_INET, &pkttype_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &pkttype6_match); + if (ret) + xt_unregister_match(AF_INET, &pkttype_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&pkttype_match); + xt_unregister_match(AF_INET, &pkttype_match); + xt_unregister_match(AF_INET6, &pkttype6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/netfilter/xt_realm.c index 54a6897ebaa6..2b7e1781d34d 100644 --- a/net/ipv4/netfilter/ipt_realm.c +++ b/net/netfilter/xt_realm.c @@ -14,12 +14,14 @@ #include <linux/netdevice.h> #include <net/route.h> -#include <linux/netfilter_ipv4/ipt_realm.h> -#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/xt_realm.h> +#include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("iptables realm match"); +MODULE_DESCRIPTION("X_tables realm match"); +MODULE_ALIAS("ipt_realm"); static int match(const struct sk_buff *skb, @@ -27,16 +29,17 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_realm_info *info = matchinfo; + const struct xt_realm_info *info = matchinfo; struct dst_entry *dst = skb->dst; return (info->id == (dst->tclassid & info->mask)) ^ info->invert; } static int check(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) @@ -44,18 +47,18 @@ static int check(const char *tablename, if (hook_mask & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) { - printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, " + 
printk("xt_realm: only valid for POST_ROUTING, LOCAL_OUT, " "LOCAL_IN or FORWARD.\n"); return 0; } - if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) { - printk("ipt_realm: invalid matchsize.\n"); + if (matchsize != XT_ALIGN(sizeof(struct xt_realm_info))) { + printk("xt_realm: invalid matchsize.\n"); return 0; } return 1; } -static struct ipt_match realm_match = { +static struct xt_match realm_match = { .name = "realm", .match = match, .checkentry = check, @@ -64,12 +67,12 @@ static struct ipt_match realm_match = { static int __init init(void) { - return ipt_register_match(&realm_match); + return xt_register_match(AF_INET, &realm_match); } static void __exit fini(void) { - ipt_unregister_match(&realm_match); + xt_unregister_match(AF_INET, &realm_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/netfilter/xt_sctp.c index fe2b327bcaa4..10fbfc5ba758 100644 --- a/net/ipv4/netfilter/ipt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -1,10 +1,18 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <net/ip.h> +#include <net/ipv6.h> #include <linux/sctp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_sctp.h> #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_sctp.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Match for SCTP protocol packets"); +MODULE_ALIAS("ipt_sctp"); #ifdef DEBUG_SCTP #define duprintf(format, args...) 
printk(format , ## args) @@ -16,7 +24,7 @@ || (!!((invflag) & (option)) ^ (cond))) static int -match_flags(const struct ipt_sctp_flag_info *flag_info, +match_flags(const struct xt_sctp_flag_info *flag_info, const int flag_count, u_int8_t chunktype, u_int8_t chunkflags) @@ -32,15 +40,15 @@ match_flags(const struct ipt_sctp_flag_info *flag_info, return 1; } -static int +static inline int match_packet(const struct sk_buff *skb, + unsigned int offset, const u_int32_t *chunkmap, int chunk_match_type, - const struct ipt_sctp_flag_info *flag_info, + const struct xt_sctp_flag_info *flag_info, const int flag_count, int *hotdrop) { - int offset; u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; sctp_chunkhdr_t _sch, *sch; @@ -52,7 +60,6 @@ match_packet(const struct sk_buff *skb, SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); } - offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t); do { sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); if (sch == NULL) { @@ -118,19 +125,20 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_sctp_info *info; + const struct xt_sctp_info *info; sctp_sctphdr_t _sh, *sh; - info = (const struct ipt_sctp_info *)matchinfo; + info = (const struct xt_sctp_info *)matchinfo; if (offset) { duprintf("Dropping non-first fragment.. 
FIXME\n"); return 0; } - sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh); + sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh); if (sh == NULL) { duprintf("Dropping evil TCP offset=0 tinygram.\n"); *hotdrop = 1; @@ -140,64 +148,103 @@ match(const struct sk_buff *skb, return SCCHECK(((ntohs(sh->source) >= info->spts[0]) && (ntohs(sh->source) <= info->spts[1])), - IPT_SCTP_SRC_PORTS, info->flags, info->invflags) + XT_SCTP_SRC_PORTS, info->flags, info->invflags) && SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) && (ntohs(sh->dest) <= info->dpts[1])), - IPT_SCTP_DEST_PORTS, info->flags, info->invflags) - && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type, + XT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, protoff, + info->chunkmap, info->chunk_match_type, info->flag_info, info->flag_count, hotdrop), - IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags); + XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *inf, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct xt_sctp_info *info; + const struct ipt_ip *ip = inf; + + info = (const struct xt_sctp_info *)matchinfo; + + return ip->proto == IPPROTO_SCTP + && !(ip->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_sctp_info)) + && !(info->flags & ~XT_SCTP_VALID_FLAGS) + && !(info->invflags & ~XT_SCTP_VALID_FLAGS) + && !(info->invflags & ~info->flags) + && ((!(info->flags & XT_SCTP_CHUNK_TYPES)) || + (info->chunk_match_type & + (SCTP_CHUNK_MATCH_ALL + | SCTP_CHUNK_MATCH_ANY + | SCTP_CHUNK_MATCH_ONLY))); +} + +static int +checkentry6(const char *tablename, + const void *inf, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - const struct ipt_sctp_info *info; + const struct xt_sctp_info *info; + const struct ip6t_ip6 *ip = inf; - info = (const struct ipt_sctp_info *)matchinfo; + info = (const struct 
xt_sctp_info *)matchinfo; return ip->proto == IPPROTO_SCTP - && !(ip->invflags & IPT_INV_PROTO) - && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info)) - && !(info->flags & ~IPT_SCTP_VALID_FLAGS) - && !(info->invflags & ~IPT_SCTP_VALID_FLAGS) + && !(ip->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_sctp_info)) + && !(info->flags & ~XT_SCTP_VALID_FLAGS) + && !(info->invflags & ~XT_SCTP_VALID_FLAGS) && !(info->invflags & ~info->flags) - && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || + && ((!(info->flags & XT_SCTP_CHUNK_TYPES)) || (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL | SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY))); } -static struct ipt_match sctp_match = + +static struct xt_match sctp_match = { - .list = { NULL, NULL}, .name = "sctp", .match = &match, .checkentry = &checkentry, - .destroy = NULL, + .me = THIS_MODULE +}; +static struct xt_match sctp6_match = +{ + .name = "sctp", + .match = &match, + .checkentry = &checkentry6, .me = THIS_MODULE }; + static int __init init(void) { - return ipt_register_match(&sctp_match); + int ret; + ret = xt_register_match(AF_INET, &sctp_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &sctp6_match); + if (ret) + xt_unregister_match(AF_INET, &sctp_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&sctp_match); + xt_unregister_match(AF_INET6, &sctp6_match); + xt_unregister_match(AF_INET, &sctp_match); } module_init(init); module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Kiran Kumar Immidi"); -MODULE_DESCRIPTION("Match for SCTP protocol packets"); - diff --git a/net/ipv4/netfilter/ipt_state.c b/net/netfilter/xt_state.c index 4d7f16b70cec..39ce808d40ef 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/netfilter/xt_state.c @@ -1,7 +1,7 @@ /* Kernel module to match connection tracking information. 
*/ /* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,12 +11,14 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack_compat.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_state.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_state.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("iptables connection tracking state match module"); +MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module"); +MODULE_ALIAS("ipt_state"); +MODULE_ALIAS("ip6t_state"); static int match(const struct sk_buff *skb, @@ -24,35 +26,43 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_state_info *sinfo = matchinfo; + const struct xt_state_info *sinfo = matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; if (nf_ct_is_untracked(skb)) - statebit = IPT_STATE_UNTRACKED; + statebit = XT_STATE_UNTRACKED; else if (!nf_ct_get_ctinfo(skb, &ctinfo)) - statebit = IPT_STATE_INVALID; + statebit = XT_STATE_INVALID; else - statebit = IPT_STATE_BIT(ctinfo); + statebit = XT_STATE_BIT(ctinfo); return (sinfo->statemask & statebit); } static int check(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_state_info))) return 0; return 1; } -static struct ipt_match state_match = { +static struct xt_match state_match = { + .name = "state", + .match = &match, + .checkentry = &check, + .me = 
THIS_MODULE, +}; + +static struct xt_match state6_match = { .name = "state", .match = &match, .checkentry = &check, @@ -61,13 +71,25 @@ static struct ipt_match state_match = { static int __init init(void) { - need_ip_conntrack(); - return ipt_register_match(&state_match); + int ret; + + need_conntrack(); + + ret = xt_register_match(AF_INET, &state_match); + if (ret < 0) + return ret; + + ret = xt_register_match(AF_INET6, &state6_match); + if (ret < 0) + xt_unregister_match(AF_INET,&state_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&state_match); + xt_unregister_match(AF_INET, &state_match); + xt_unregister_match(AF_INET6, &state6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_string.c b/net/netfilter/xt_string.c index b5def204d798..7c7d5c8807d6 100644 --- a/net/ipv4/netfilter/ipt_string.c +++ b/net/netfilter/xt_string.c @@ -11,23 +11,26 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_string.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_string.h> #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); MODULE_DESCRIPTION("IP tables string match module"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_string"); +MODULE_ALIAS("ip6t_string"); static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { struct ts_state state; - struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; + struct xt_string_info *conf = (struct xt_string_info *) matchinfo; memset(&state, 0, sizeof(struct ts_state)); @@ -36,18 +39,18 @@ static int match(const struct sk_buff *skb, != UINT_MAX) && !conf->invert; } -#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) +#define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m) static int 
checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ip, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - struct ipt_string_info *conf = matchinfo; + struct xt_string_info *conf = matchinfo; struct ts_config *ts_conf; - if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) + if (matchsize != XT_ALIGN(sizeof(struct xt_string_info))) return 0; /* Damn, can't handle this case properly with iptables... */ @@ -69,7 +72,14 @@ static void destroy(void *matchinfo, unsigned int matchsize) textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); } -static struct ipt_match string_match = { +static struct xt_match string_match = { + .name = "string", + .match = match, + .checkentry = checkentry, + .destroy = destroy, + .me = THIS_MODULE +}; +static struct xt_match string6_match = { .name = "string", .match = match, .checkentry = checkentry, @@ -79,12 +89,22 @@ static struct ipt_match string_match = { static int __init init(void) { - return ipt_register_match(&string_match); + int ret; + + ret = xt_register_match(AF_INET, &string_match); + if (ret) + return ret; + ret = xt_register_match(AF_INET6, &string6_match); + if (ret) + xt_unregister_match(AF_INET, &string_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&string_match); + xt_unregister_match(AF_INET, &string_match); + xt_unregister_match(AF_INET6, &string6_match); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 4dc9b16ab4a3..acf7f533e9f1 100644 --- a/net/ipv4/netfilter/ipt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -1,6 +1,7 @@ /* Kernel module to match TCP MSS values. 
*/ /* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Portions (C) 2005 by Harald Welte <laforge@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,19 +12,24 @@ #include <linux/skbuff.h> #include <net/tcp.h> -#include <linux/netfilter_ipv4/ipt_tcpmss.h> +#include <linux/netfilter/xt_tcpmss.h> +#include <linux/netfilter/x_tables.h> + #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> #define TH_SYN 0x02 MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables TCP MSS match module"); +MODULE_ALIAS("ipt_tcpmss"); /* Returns 1 if the mss option is set and matched by the range, 0 otherwise */ static inline int mssoption_match(u_int16_t min, u_int16_t max, const struct sk_buff *skb, + unsigned int protoff, int invert, int *hotdrop) { @@ -33,8 +39,7 @@ mssoption_match(u_int16_t min, u_int16_t max, unsigned int i, optlen; /* If we don't have the whole header, drop packet. */ - th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, - sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); if (th == NULL) goto dropit; @@ -47,8 +52,7 @@ mssoption_match(u_int16_t min, u_int16_t max, goto out; /* Truncated options. 
*/ - op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th), - optlen, _opt); + op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt); if (op == NULL) goto dropit; @@ -79,22 +83,24 @@ match(const struct sk_buff *skb, const struct net_device *out, const void *matchinfo, int offset, + unsigned int protoff, int *hotdrop) { - const struct ipt_tcpmss_match_info *info = matchinfo; + const struct xt_tcpmss_match_info *info = matchinfo; - return mssoption_match(info->mss_min, info->mss_max, skb, + return mssoption_match(info->mss_min, info->mss_max, skb, protoff, info->invert, hotdrop); } static int checkentry(const char *tablename, - const struct ipt_ip *ip, + const void *ipinfo, void *matchinfo, unsigned int matchsize, unsigned int hook_mask) { - if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info))) + const struct ipt_ip *ip = ipinfo; + if (matchsize != XT_ALIGN(sizeof(struct xt_tcpmss_match_info))) return 0; /* Must specify -p tcp */ @@ -106,21 +112,60 @@ checkentry(const char *tablename, return 1; } -static struct ipt_match tcpmss_match = { +static int +checkentry6(const char *tablename, + const void *ipinfo, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_ip6 *ip = ipinfo; + + if (matchsize != XT_ALIGN(sizeof(struct xt_tcpmss_match_info))) + return 0; + + /* Must specify -p tcp */ + if (ip->proto != IPPROTO_TCP || (ip->invflags & XT_INV_PROTO)) { + printk("tcpmss: Only works on TCP packets\n"); + return 0; + } + + return 1; +} + +static struct xt_match tcpmss_match = { .name = "tcpmss", .match = &match, .checkentry = &checkentry, .me = THIS_MODULE, }; +static struct xt_match tcpmss6_match = { + .name = "tcpmss", + .match = &match, + .checkentry = &checkentry6, + .me = THIS_MODULE, +}; + + static int __init init(void) { - return ipt_register_match(&tcpmss_match); + int ret; + ret = xt_register_match(AF_INET, &tcpmss_match); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, 
&tcpmss6_match); + if (ret) + xt_unregister_match(AF_INET, &tcpmss_match); + + return ret; } static void __exit fini(void) { - ipt_unregister_match(&tcpmss_match); + xt_unregister_match(AF_INET6, &tcpmss6_match); + xt_unregister_match(AF_INET, &tcpmss_match); } module_init(init); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c new file mode 100644 index 000000000000..669c8113cc60 --- /dev/null +++ b/net/netfilter/xt_tcpudp.c @@ -0,0 +1,334 @@ +#include <linux/types.h> +#include <linux/module.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_DESCRIPTION("x_tables match for TCP and UDP, supports IPv4 and IPv6"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("xt_tcp"); +MODULE_ALIAS("xt_udp"); +MODULE_ALIAS("ipt_udp"); +MODULE_ALIAS("ipt_tcp"); +MODULE_ALIAS("ip6t_udp"); +MODULE_ALIAS("ip6t_tcp"); + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int protoff, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. 
*/ + op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct xt_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + XT_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, protoff, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & XT_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +tcp_checkentry(const char *tablename, + const void *info, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_ip *ip = info; + const struct xt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ip->proto == IPPROTO_TCP + && !(ip->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_tcp)) + && !(tcpinfo->invflags & ~XT_TCP_INV_MASK); +} + +/* Called when user tries to insert an entry of this type. */ +static int +tcp6_checkentry(const char *tablename, + const void *entry, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_ip6 *ipv6 = entry; + const struct xt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ipv6->proto == IPPROTO_TCP + && !(ipv6->invflags & XT_INV_PROTO) + && matchsize == XT_ALIGN(sizeof(struct xt_tcp)) + && !(tcpinfo->invflags & ~XT_TCP_INV_MASK); +} + + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct xt_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +udp_checkentry(const char *tablename, + const void *info, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_ip *ip = info; + const struct xt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ip->proto != IPPROTO_UDP || (ip->invflags & XT_INV_PROTO)) { + duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != XT_ALIGN(sizeof(struct xt_udp))) { + duprintf("ipt_udp: matchsize %u != %u\n", + matchinfosize, XT_ALIGN(sizeof(struct xt_udp))); + return 0; + } + if (udpinfo->invflags & ~XT_UDP_INV_MASK) { + duprintf("ipt_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp6_checkentry(const char *tablename, + const void *entry, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_ip6 *ipv6 = entry; + const struct xt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & XT_INV_PROTO)) { + duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != XT_ALIGN(sizeof(struct xt_udp))) { + duprintf("ip6t_udp: matchsize %u != %u\n", + matchinfosize, XT_ALIGN(sizeof(struct xt_udp))); + return 0; + } + if (udpinfo->invflags & ~XT_UDP_INV_MASK) { + duprintf("ip6t_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +static struct xt_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, + .me = THIS_MODULE, +}; +static struct xt_match tcp6_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp6_checkentry, + .me = THIS_MODULE, +}; + +static struct xt_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, + 
.me = THIS_MODULE, +}; +static struct xt_match udp6_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp6_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + ret = xt_register_match(AF_INET, &tcp_matchstruct); + if (ret) + return ret; + + ret = xt_register_match(AF_INET6, &tcp6_matchstruct); + if (ret) + goto out_unreg_tcp; + + ret = xt_register_match(AF_INET, &udp_matchstruct); + if (ret) + goto out_unreg_tcp6; + + ret = xt_register_match(AF_INET6, &udp6_matchstruct); + if (ret) + goto out_unreg_udp; + + return ret; + +out_unreg_udp: + xt_unregister_match(AF_INET, &tcp_matchstruct); +out_unreg_tcp6: + xt_unregister_match(AF_INET6, &tcp6_matchstruct); +out_unreg_tcp: + xt_unregister_match(AF_INET, &tcp_matchstruct); + return ret; +} + +static void __exit fini(void) +{ + xt_unregister_match(AF_INET6, &udp6_matchstruct); + xt_unregister_match(AF_INET, &udp_matchstruct); + xt_unregister_match(AF_INET6, &tcp6_matchstruct); + xt_unregister_match(AF_INET, &tcp_matchstruct); +} + +module_init(init); +module_exit(fini); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96020d7087e8..2101b45d2ec6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -24,6 +24,7 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/capability.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/signal.h> @@ -293,7 +294,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) return 0; } -static struct proto_ops netlink_ops; +static const struct proto_ops netlink_ops; static int netlink_insert(struct sock *sk, u32 pid) { @@ -402,7 +403,7 @@ static int netlink_create(struct socket *sock, int protocol) groups = nl_table[protocol].groups; netlink_unlock_table(); - if ((err = __netlink_create(sock, protocol) < 0)) + if ((err = __netlink_create(sock, protocol)) < 0) goto out_module; nlk = nlk_sk(sock->sk); @@ -1422,7 +1423,7 @@ static int 
netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, while (skb->len >= nlmsg_total_size(0)) { nlh = (struct nlmsghdr *) skb->data; - if (skb->len < nlh->nlmsg_len) + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); @@ -1656,7 +1657,7 @@ int netlink_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 287cfcc56951..4ae1538c54a9 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -222,11 +222,6 @@ int genl_register_family(struct genl_family *family) goto errout_locked; } - if (!try_module_get(family->owner)) { - err = -EBUSY; - goto errout_locked; - } - if (family->id == GENL_ID_GENERATE) { u16 newid = genl_generate_id(); @@ -283,7 +278,6 @@ int genl_unregister_family(struct genl_family *family) INIT_LIST_HEAD(&family->ops_list); genl_unlock(); - module_put(family->owner); kfree(family->attrbuf); genl_ctrl_event(CTRL_CMD_DELFAMILY, family); return 0; @@ -441,7 +435,7 @@ errout: } static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, - int seq, int cmd) + int seq, u8 cmd) { struct sk_buff *skb; int err; @@ -535,7 +529,6 @@ static struct genl_family genl_ctrl = { .name = "nlctrl", .version = 0x1, .maxattr = CTRL_ATTR_MAX, - .owner = THIS_MODULE, }; static int __init genl_init(void) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e5d82d711cae..d44981f5a619 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -11,6 +11,7 @@ #include <linux/config.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -63,7 +64,7 @@ 
static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); -static struct proto_ops nr_proto_ops; +static const struct proto_ops nr_proto_ops; /* * Socket removal during an interrupt is now safe. @@ -1166,10 +1167,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; int ret; - lock_sock(sk); switch (cmd) { case TIOCOUTQ: { long amount; + + lock_sock(sk); amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amount < 0) amount = 0; @@ -1180,6 +1182,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: { struct sk_buff *skb; long amount = 0L; + + lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; @@ -1188,6 +1192,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCGSTAMP: + lock_sock(sk); ret = sock_get_timestamp(sk, argp); release_sock(sk); return ret; @@ -1202,21 +1207,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: - release_sock(sk); return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: - release_sock(sk); if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: - release_sock(sk); - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } - release_sock(sk); return 0; } @@ -1337,7 +1338,7 @@ static struct net_proto_family nr_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops nr_proto_ops = { +static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, diff --git a/net/nonet.c b/net/nonet.c index e5241dceaa57..1230f0ae832e 100644 --- a/net/nonet.c +++ b/net/nonet.c @@ -14,11 +14,6 @@ #include <linux/init.h> #include <linux/kernel.h> -void __init 
sock_init(void) -{ - printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n"); -} - static int sock_no_open(struct inode *irrelevant, struct file *dontcare) { return -ENXIO; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3e2462760413..ee93abc71cb8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -53,6 +53,7 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> @@ -251,10 +252,10 @@ static void packet_sock_destruct(struct sock *sk) } -static struct proto_ops packet_ops; +static const struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt; +static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1237,7 +1238,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) goto done; err = -ENOBUFS; - i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL); + i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; @@ -1521,7 +1522,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, #endif default: - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } return 0; } @@ -1784,7 +1785,7 @@ out: #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt = { +static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, @@ -1806,7 +1807,7 @@ static struct proto_ops packet_ops_spkt = { }; #endif -static struct proto_ops packet_ops = { +static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 829fdbc4400b..ea65396d1619 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -9,7 +9,9 @@ * Copyright (C) Terry Dawson VK2KTJ 
(terry@animats.net) * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) */ + #include <linux/config.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> @@ -1320,7 +1322,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } return 0; diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c index 2ba14a75dbbe..0e0a4553499f 100644 --- a/net/rxrpc/connection.c +++ b/net/rxrpc/connection.c @@ -220,6 +220,7 @@ int rxrpc_connection_lookup(struct rxrpc_peer *peer, { struct rxrpc_connection *conn, *candidate = NULL; struct list_head *_p; + struct sk_buff *pkt = msg->pkt; int ret, fresh = 0; __be32 x_epoch, x_connid; __be16 x_port, x_servid; @@ -229,10 +230,10 @@ int rxrpc_connection_lookup(struct rxrpc_peer *peer, _enter("%p{{%hu}},%u,%hu", peer, peer->trans->port, - ntohs(msg->pkt->h.uh->source), + ntohs(pkt->h.uh->source), ntohs(msg->hdr.serviceId)); - x_port = msg->pkt->h.uh->source; + x_port = pkt->h.uh->source; x_epoch = msg->hdr.epoch; x_clflag = msg->hdr.flags & RXRPC_CLIENT_INITIATED; x_connid = htonl(ntohl(msg->hdr.cid) & RXRPC_CIDMASK); @@ -267,7 +268,7 @@ int rxrpc_connection_lookup(struct rxrpc_peer *peer, /* fill in the specifics */ candidate->addr.sin_family = AF_INET; candidate->addr.sin_port = x_port; - candidate->addr.sin_addr.s_addr = msg->pkt->nh.iph->saddr; + candidate->addr.sin_addr.s_addr = pkt->nh.iph->saddr; candidate->in_epoch = x_epoch; candidate->out_epoch = x_epoch; candidate->in_clientflag = RXRPC_CLIENT_INITIATED; @@ -675,6 +676,7 @@ int rxrpc_conn_receive_call_packet(struct rxrpc_connection *conn, struct rxrpc_message *msg) { struct rxrpc_message *pmsg; + struct dst_entry *dst; struct list_head *_p; unsigned cix, seq; int ret = 0; @@ -710,10 +712,10 @@ int rxrpc_conn_receive_call_packet(struct rxrpc_connection *conn, call->pkt_rcv_count++; - if (msg->pkt->dst && 
msg->pkt->dst->dev) + dst = msg->pkt->dst; + if (dst && dst->dev) conn->peer->if_mtu = - msg->pkt->dst->dev->mtu - - msg->pkt->dst->dev->hard_header_len; + dst->dev->mtu - dst->dev->hard_header_len; /* queue on the call in seq order */ rxrpc_get_message(msg); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 55cd5327fbd7..778b1e5a4b50 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -44,7 +44,7 @@ if NET_SCHED choice prompt "Packet scheduler clock source" - default NET_SCH_CLK_JIFFIES + default NET_SCH_CLK_GETTIMEOFDAY ---help--- Packet schedulers need a monotonic clock that increments at a static rate. The kernel provides several suitable interfaces, each with @@ -411,7 +411,7 @@ config NET_EMATCH_META tristate "Metadata" depends on NET_EMATCH ---help--- - Say Y here if you want to be ablt to classify packets based on + Say Y here if you want to be able to classify packets based on metadata such as load average, netfilter attributes, socket attributes and routing decisions. 
diff --git a/net/sched/Makefile b/net/sched/Makefile index e48d0d456b3e..0f06aec66094 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -7,13 +7,13 @@ obj-y := sch_generic.o obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o -obj-$(CONFIG_NET_ACT_POLICE) += police.o -obj-$(CONFIG_NET_CLS_POLICE) += police.o -obj-$(CONFIG_NET_ACT_GACT) += gact.o -obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o -obj-$(CONFIG_NET_ACT_IPT) += ipt.o -obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o -obj-$(CONFIG_NET_ACT_SIMP) += simple.o +obj-$(CONFIG_NET_ACT_POLICE) += act_police.o +obj-$(CONFIG_NET_CLS_POLICE) += act_police.o +obj-$(CONFIG_NET_ACT_GACT) += act_gact.o +obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o +obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o +obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o +obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2ce1cb2aa2ed..792ce59940ec 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, while ((a = act) != NULL) { repeat: if (a->ops && a->ops->act) { - ret = a->ops->act(&skb, a, res); + ret = a->ops->act(skb, a, res); if (TC_MUNGED & skb->tc_verd) { /* copied already, allow trampling */ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -290,7 +290,7 @@ struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, if (a_o == NULL) { #ifdef CONFIG_KMOD rtnl_unlock(); - request_module(act_name); + request_module("act_%s", act_name); rtnl_lock(); a_o = tc_lookup_action_n(act_name); diff --git a/net/sched/gact.c b/net/sched/act_gact.c index d1c6d542912a..a1e68f78dcc2 100644 --- a/net/sched/gact.c +++ b/net/sched/act_gact.c @@ -135,10 +135,9 @@ tcf_gact_cleanup(struct tc_action *a, int bind) } static int 
-tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) +tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { struct tcf_gact *p = PRIV(a, gact); - struct sk_buff *skb = *pskb; int action = TC_ACT_SHOT; spin_lock(&p->lock); diff --git a/net/sched/ipt.c b/net/sched/act_ipt.c index f50136eed211..39a22a3ffe78 100644 --- a/net/sched/ipt.c +++ b/net/sched/act_ipt.c @@ -62,7 +62,7 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) struct ipt_target *target; int ret = 0; - target = ipt_find_target(t->u.user.name, t->u.user.revision); + target = xt_find_target(AF_INET, t->u.user.name, t->u.user.revision); if (!target) return -ENOENT; @@ -201,11 +201,10 @@ tcf_ipt_cleanup(struct tc_action *a, int bind) } static int -tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) +tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *p = PRIV(a, ipt); - struct sk_buff *skb = *pskb; if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) @@ -222,6 +221,9 @@ tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) worry later - danger - this API seems to have changed from earlier kernels */ + /* iptables targets take a double skb pointer in case the skb + * needs to be replaced. We don't own the skb, so this must not + * happen. 
The pskb_expand_head above should make sure of this */ ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, p->hook, p->t->data, NULL); switch (ret) { diff --git a/net/sched/mirred.c b/net/sched/act_mirred.c index 20d06916dc0b..4fcccbd50885 100644 --- a/net/sched/mirred.c +++ b/net/sched/act_mirred.c @@ -158,12 +158,11 @@ tcf_mirred_cleanup(struct tc_action *a, int bind) } static int -tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) +tcf_mirred(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *p = PRIV(a, mirred); struct net_device *dev; struct sk_buff *skb2 = NULL; - struct sk_buff *skb = *pskb; u32 at = G_TC_AT(skb->tc_verd); spin_lock(&p->lock); diff --git a/net/sched/pedit.c b/net/sched/act_pedit.c index 767d24f4610e..1742a68e0122 100644 --- a/net/sched/pedit.c +++ b/net/sched/act_pedit.c @@ -130,10 +130,9 @@ tcf_pedit_cleanup(struct tc_action *a, int bind) } static int -tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) +tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = PRIV(a, pedit); - struct sk_buff *skb = *pskb; int i, munged = 0; u8 *pptr; @@ -246,10 +245,12 @@ tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); t.expires = jiffies_to_clock_t(p->tm.expires); RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + kfree(opt); return skb->len; rtattr_failure: skb_trim(skb, b - skb->data); + kfree(opt); return -1; } diff --git a/net/sched/police.c b/net/sched/act_police.c index eb39fb2f39b6..fa877f8f652c 100644 --- a/net/sched/police.c +++ b/net/sched/act_police.c @@ -284,11 +284,10 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind) return 0; } -static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a, +static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { psched_time_t now; - 
struct sk_buff *skb = *pskb; struct tcf_police *p = PRIV(a); long toks; long ptoks = 0; @@ -408,7 +407,7 @@ police_cleanup_module(void) module_init(police_init_module); module_exit(police_cleanup_module); -#endif +#else /* CONFIG_NET_CLS_ACT */ struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) { @@ -545,6 +544,7 @@ int tcf_police(struct sk_buff *skb, struct tcf_police *p) spin_unlock(&p->lock); return p->action; } +EXPORT_SYMBOL(tcf_police); int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) { @@ -601,13 +601,4 @@ errout: return -1; } - -EXPORT_SYMBOL(tcf_police); -EXPORT_SYMBOL(tcf_police_destroy); -EXPORT_SYMBOL(tcf_police_dump); -EXPORT_SYMBOL(tcf_police_dump_stats); -EXPORT_SYMBOL(tcf_police_hash); -EXPORT_SYMBOL(tcf_police_ht); -EXPORT_SYMBOL(tcf_police_locate); -EXPORT_SYMBOL(tcf_police_lookup); -EXPORT_SYMBOL(tcf_police_new_index); +#endif /* CONFIG_NET_CLS_ACT */ diff --git a/net/sched/simple.c b/net/sched/act_simple.c index 8a6ae4f491e8..e5f2e1f431e2 100644 --- a/net/sched/simple.c +++ b/net/sched/act_simple.c @@ -44,9 +44,8 @@ static DEFINE_RWLOCK(simp_lock); #include <net/pkt_act.h> #include <net/act_generic.h> -static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) +static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) { - struct sk_buff *skb = *pskb; struct tcf_defact *p = PRIV(a, defact); spin_lock(&p->lock); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 64b047c65568..5cb956b721e8 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -92,7 +92,6 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> -#include <config/net/ematch/stack.h> static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 09453f997d8c..6cd81708bf71 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -257,7 +257,7 @@ cbq_classify(struct sk_buff *skb, 
struct Qdisc *sch, int *qerr) (cl = cbq_class_lookup(q, prio)) != NULL) return cl; - *qerr = NET_XMIT_DROP; + *qerr = NET_XMIT_BYPASS; for (;;) { int result = 0; defmap = head->defaults; @@ -413,7 +413,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->rx_class = cl; #endif if (cl == NULL) { - if (ret == NET_XMIT_DROP) + if (ret == NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c26764bc4103..91132f6871d7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -208,7 +208,7 @@ struct hfsc_sched do { \ struct timeval tv; \ do_gettimeofday(&tv); \ - (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \ + (stamp) = 1ULL * USEC_PER_SEC * tv.tv_sec + tv.tv_usec; \ } while (0) #endif @@ -502,8 +502,8 @@ d2dx(u32 d) u64 dx; dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); - dx += 1000000 - 1; - do_div(dx, 1000000); + dx += USEC_PER_SEC - 1; + do_div(dx, USEC_PER_SEC); return dx; } @@ -523,7 +523,7 @@ dx2d(u64 dx) { u64 d; - d = dx * 1000000; + d = dx * USEC_PER_SEC; do_div(d, PSCHED_JIFFIE2US(HZ)); return (u32)d; } @@ -1227,7 +1227,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) if (cl->level == 0) return cl; - *qerr = NET_XMIT_DROP; + *qerr = NET_XMIT_BYPASS; tcf = q->root.filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT @@ -1643,7 +1643,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { - if (err == NET_XMIT_DROP) + if (err == NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 558cc087e602..3ec95df4a85e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -321,7 +321,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, in if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) return cl; - *qerr = NET_XMIT_DROP; + *qerr = 
NET_XMIT_BYPASS; tcf = q->filter_list; while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT @@ -724,7 +724,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { - if (ret == NET_XMIT_DROP) + if (ret == NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb (skb); return ret; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82fb07aa06a5..ba5283204837 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,7 +25,7 @@ #include <net/pkt_sched.h> -#define VERSION "1.1" +#define VERSION "1.2" /* Network Emulation Queuing algorithm. ==================================== @@ -65,11 +65,12 @@ struct netem_sched_data { u32 jitter; u32 duplicate; u32 reorder; + u32 corrupt; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor, reorder_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; @@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } + /* + * Randomized packet corruption. + * Make copy if needed since we are modifying + * If packet is going to be hardware checksummed, then + * do it now in software before we mangle it. 
+ */ + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) + || (skb->ip_summed == CHECKSUM_HW + && skb_checksum_help(skb, 0))) { + sch->qstats.drops++; + return NET_XMIT_DROP; + } + + skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + } + if (q->gap == 0 /* not doing reordering */ || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { @@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) return 0; } +static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corrupt *r = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*r)) + return -EINVAL; + + q->corrupt = r->probability; + init_crandom(&q->corrupt_cor, r->correlation); + return 0; +} + +/* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (ret) return ret; } + if (tb[TCA_NETEM_REORDER-1]) { ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); if (ret) return ret; } - } + if (tb[TCA_NETEM_CORRUPT-1]) { + ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); + if (ret) + return ret; + } + } return 0; } @@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; + struct tc_netem_corrupt corrupt; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) reorder.correlation = q->reorder_cor.rho; RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + corrupt.probability = q->corrupt; + corrupt.correlation = q->corrupt_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rta->rta_len = 
skb->tail - b; return skb->len; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 3ac0f495bad0..5b3a3e48ed92 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -54,7 +54,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) u32 band = skb->priority; struct tcf_result res; - *qerr = NET_XMIT_DROP; + *qerr = NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { #ifdef CONFIG_NET_CLS_ACT switch (tc_classify(skb, q->filter_list, &res)) { @@ -91,7 +91,8 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { - if (ret == NET_XMIT_DROP) + + if (ret == NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; @@ -118,7 +119,7 @@ prio_requeue(struct sk_buff *skb, struct Qdisc* sch) qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { - if (ret == NET_XMIT_DROP) + if (ret == NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); return ret; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342706b5..79b8ef34c6e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> +#include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -273,7 +274,7 @@ teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *de static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) { - struct teql_master *master = (void*)dev->priv; + struct teql_master *master = netdev_priv(dev); struct Qdisc *start, *q; int busy; int nores; @@ -349,7 +350,7 @@ drop: static int teql_master_open(struct net_device *dev) { struct Qdisc * q; - struct teql_master *m = (void*)dev->priv; + struct teql_master *m = netdev_priv(dev); int mtu = 0xFFFE; unsigned flags = IFF_NOARP|IFF_MULTICAST; @@ -396,13 +397,13 @@ static int teql_master_close(struct 
net_device *dev) static struct net_device_stats *teql_master_stats(struct net_device *dev) { - struct teql_master *m = (void*)dev->priv; + struct teql_master *m = netdev_priv(dev); return &m->stats; } static int teql_master_mtu(struct net_device *dev, int new_mtu) { - struct teql_master *m = (void*)dev->priv; + struct teql_master *m = netdev_priv(dev); struct Qdisc *q; if (new_mtu < 68) @@ -422,7 +423,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) static __init void teql_master_setup(struct net_device *dev) { - struct teql_master *master = dev->priv; + struct teql_master *master = netdev_priv(dev); struct Qdisc_ops *ops = &master->qops; master->dev = dev; @@ -475,7 +476,7 @@ static int __init teql_init(void) break; } - master = dev->priv; + master = netdev_priv(dev); strlcpy(master->qops.id, dev->name, IFNAMSIZ); err = register_qdisc(&master->qops); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dec68a604773..9d05e13e92f6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) * 1000; - asoc->pmtu = 0; asoc->frag_point = 0; /* Set the association max_retrans and RTO values from the @@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->overall_error_count = 0; + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. 
*/ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ @@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; @@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, sctp_transport_set_owner(peer, asoc); + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + /* Initialize the pmtu of the transport. */ - sctp_transport_pmtu(peer); + if (peer->param_flags & SPP_PMTUD_ENABLE) + sctp_transport_pmtu(peer); + else if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. 
*/ - if (asoc->pmtu) - asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); else - asoc->pmtu = peer->pmtu; + asoc->pathmtu = peer->pathmtu; SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " - "%d\n", asoc, asoc->pmtu); + "%d\n", asoc, asoc->pathmtu); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. @@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * (for example, implementations MAY use the size of the * receiver advertised window). */ - peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set @@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->partial_bytes_acked = 0; peer->flight_size = 0; - /* By default, enable heartbeat for peer address. */ - peer->hb_allowed = 1; - - /* Initialize the peer's heartbeat interval based on the - * sock configured value. - */ - peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); - - /* Set the path max_retrans. */ - peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; - /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; @@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. 
*/ list_for_each(pos, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); - if (!pmtu || (t->pmtu < pmtu)) - pmtu = t->pmtu; + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; } if (pmtu) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); - asoc->pmtu = pmtu; + asoc->pathmtu = pmtu; asoc->frag_point = sctp_frag_point(sp, pmtu); } SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", - __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); + __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point); } /* Should we send a SACK to update our peer? */ @@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc) case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= - min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu))) return 1; break; default: diff --git a/net/sctp/input.c b/net/sctp/input.c index b24ff2c1aef5..4aa6fc60357c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -225,6 +225,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; + nf_reset(skb); ret = sk_filter(sk, skb, 1); if (ret) @@ -305,18 +306,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", __FUNCTION__, pmtu, - SCTP_DEFAULT_MINSEGMENT); - pmtu = SCTP_DEFAULT_MINSEGMENT; - } + if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu)) + return; - if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { - t->pmtu = pmtu; + if (t->param_flags & SPP_PMTUD_ENABLE) { + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default 
minimum of %d\n", + __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + t->param_flags = (t->param_flags & ~SPP_HB) | + SPP_PMTUD_DISABLE; + } else { + t->pathmtu = pmtu; + } + + /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); - sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } /* diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fa3be2b8fb5f..2e266129a764 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -180,8 +180,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport, } SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, " - "src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " - "dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + "src:" NIP6_FMT " dst:" NIP6_FMT "\n", __FUNCTION__, skb, skb->len, NIP6(fl.fl6_src), NIP6(fl.fl6_dst)); @@ -206,13 +205,13 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc, fl.oif = daddr->v6.sin6_scope_id; - SCTP_DEBUG_PRINTK("%s: DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", + SCTP_DEBUG_PRINTK("%s: DST=" NIP6_FMT " ", __FUNCTION__, NIP6(fl.fl6_dst)); if (saddr) { ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr); SCTP_DEBUG_PRINTK( - "SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x - ", + "SRC=" NIP6_FMT " - ", NIP6(fl.fl6_src)); } @@ -221,8 +220,7 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc, struct rt6_info *rt; rt = (struct rt6_info *)dst; SCTP_DEBUG_PRINTK( - "rt6_dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " - 
"rt6_src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + "rt6_dst:" NIP6_FMT " rt6_src:" NIP6_FMT "\n", NIP6(rt->rt6i_dst.addr), NIP6(rt->rt6i_src.addr)); } else { SCTP_DEBUG_PRINTK("NO ROUTE\n"); @@ -271,13 +269,12 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc, __u8 bmatchlen; SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p " - "daddr:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", + "daddr:" NIP6_FMT " ", __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr)); if (!asoc) { ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr); - SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n", NIP6(saddr->v6.sin6_addr)); return; } @@ -305,13 +302,11 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc, if (baddr) { memcpy(saddr, baddr, sizeof(union sctp_addr)); - SCTP_DEBUG_PRINTK("saddr: " - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + SCTP_DEBUG_PRINTK("saddr: " NIP6_FMT "\n", NIP6(saddr->v6.sin6_addr)); } else { printk(KERN_ERR "%s: asoc:%p Could not find a valid source " - "address for the " - "dest:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + "address for the dest:" NIP6_FMT "\n", __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr)); } @@ -675,8 +670,7 @@ static int sctp_v6_is_ce(const struct sk_buff *skb) /* Dump the v6 addr to the seq file. */ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { - seq_printf(seq, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", - NIP6(addr->v6.sin6_addr)); + seq_printf(seq, NIP6_FMT " ", NIP6(addr->v6.sin6_addr)); } /* Initialize a PF_INET6 socket msg_name. 
*/ @@ -866,7 +860,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 2; } -static struct proto_ops inet6_seqpacket_ops = { +static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -905,7 +899,7 @@ static struct inet_protosw sctpv6_stream_protosw = { .flags = SCTP_PROTOSW_FLAG, }; -static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +static int sctp6_rcv(struct sk_buff **pskb) { return sctp_rcv(*pskb) ? -1 : 0; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 931371633464..a40991ef72c9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pmtu) : - (packet->transport->pmtu)); + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); too_big = (psize + chunk_len > pmtu); @@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (!dst || (dst->obsolete > 1)) { dst_release(dst); sctp_transport_route(tp, NULL, sctp_sk(sk)); - sctp_assoc_sync_pmtu(asoc); + if (asoc->param_flags & SPP_PMTUD_ENABLE) { + sctp_assoc_sync_pmtu(asoc); + } } nskb->dst = dst_clone(tp->dst); @@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); - (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + if (tp->param_flags & SPP_PMTUD_ENABLE) + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + else + (*tp->af_specific->sctp_xmit)(nskb, tp, 1); out: packet->size = packet->overhead; @@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * if ((flightsize + Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ - max_burst_bytes = asoc->max_burst * asoc->pmtu; + max_burst_bytes = asoc->max_burst * asoc->pathmtu; if 
((transport->flight_size + max_burst_bytes) < transport->cwnd) { transport->cwnd = transport->flight_size + max_burst_bytes; SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " @@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * data will fit or delay in hopes of bundling a full * sized packet. */ - if (len < asoc->pmtu - packet->overhead) { + if (len < asoc->pathmtu - packet->overhead) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f775d78aa59d..de693b43c8ea 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> @@ -829,7 +830,7 @@ static struct notifier_block sctp_inetaddr_notifier = { }; /* Socket operations. */ -static struct proto_ops inet_seqpacket_ops = { +static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... 
*/ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f9573eba5c7a..556c495c6922 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1287,7 +1287,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; - retval = (sctp_cookie_param_t *)kmalloc(*cookie_len, GFP_ATOMIC); + retval = kmalloc(*cookie_len, GFP_ATOMIC); if (!retval) { *cookie_len = 0; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 823947170a33..b8b38aba92b3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; - if (force) + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, if (!asoc->peer.sack_needed) { /* We will need a SACK for the next packet. */ asoc->peer.sack_needed = 1; - goto out; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + else + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + + /* Restart the SACK timer. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { if (asoc->a_rwnd > asoc->rwnd) asoc->a_rwnd = asoc->rwnd; @@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } -out: + return error; nomem: error = -ENOMEM; @@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE && - (transport->error_count++ >= transport->max_retrans)) { + (transport->error_count++ >= transport->pathmaxrxt)) { SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", " transport IP: port:%d failed.\n", asoc, @@ -1232,8 +1250,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_TIMER_START: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; - if (!timeout) - BUG(); + BUG_ON(!timeout); timer->expires = jiffies + timeout; sctp_association_hold(asoc); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 475bfb4972d9..477d7f80dba6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, * HEARTBEAT is sent (see Section 8.3). 
*/ - if (transport->hb_allowed) { + if (transport->param_flags & SPP_HB_ENABLE) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, arg, commands)) @@ -1036,14 +1036,14 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, if (from_addr.sa.sa_family == AF_INET6) { printk(KERN_WARNING "%s association %p could not find address " - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NIP6_FMT "\n", __FUNCTION__, asoc, NIP6(from_addr.v6.sin6_addr)); } else { printk(KERN_WARNING "%s association %p could not find address " - "%u.%u.%u.%u\n", + NIPQUAD_FMT "\n", __FUNCTION__, asoc, NIPQUAD(from_addr.v4.sin_addr.s_addr)); @@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_DISCARD; } - max_interval = link->hb_interval + link->rto; + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ if (time_after(hbinfo->sent_at, jiffies) || @@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, * document allow. However, an SCTP transmitter MUST NOT be * more aggressive than the following algorithms allow. */ - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } - return SCTP_DISPOSITION_CONSUME; discard_force: @@ -2721,13 +2716,9 @@ discard_force: return SCTP_DISPOSITION_DISCARD; discard_noforce: - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } return SCTP_DISPOSITION_DISCARD; consume: return SCTP_DISPOSITION_CONSUME; @@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, * send another. 
*/ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); return SCTP_DISPOSITION_CONSUME; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9df888e932c5..c98ee375ba5e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -63,6 +63,7 @@ #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> +#include <linux/capability.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> @@ -860,7 +861,7 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL); + kaddrs = kmalloc(addrs_size, GFP_KERNEL); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1150,7 +1151,7 @@ SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL); + kaddrs = kmalloc(addrs_size, GFP_KERNEL); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1941,107 +1942,379 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. 
* spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. 
Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address field is empty then all addresses + * on the association are affected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address field is empty then all addresses + * on the association are affected. Note also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results.
*/ +int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + if (params->spp_hbinterval) { + if (trans) { + trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + if (params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(sp, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + if (params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + 
(trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + static int sctp_setsockopt_peer_addr_params(struct sock *sk, char __user *optval, int optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); int error; + int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return -EINVAL; + return - EINVAL; + if (copy_from_user(&params, optval, optlen)) return -EFAULT; - /* - * API 7. Socket Options (setting the default value for the endpoint) - * All options that support specific settings on an association by - * filling in either an association id variable or a sockaddr_storage - * SHOULD also support setting of the same value for the entire endpoint - * (i.e. future associations). To accomplish this the following logic is - * used when setting one of these options: - - * c) If neither the sockaddr_storage or association identification is - * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and - * the association identification is 0, the settings are a default - * and to be applied to the endpoint (all future associations). - */ + /* Validate flags and value parameters.
*/ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu + && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; - /* update default value for endpoint (all future associations) */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)&params.spp_address)) { - /* Manual heartbeat on an endpoint is invalid. */ - if (0xffffffff == params.spp_hbinterval) + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) { + trans = sctp_addr_id2transport(sk, &params.spp_address, + params.spp_assoc_id); + if (!trans) return -EINVAL; - else if (params.spp_hbinterval) - sctp_sk(sk)->paddrparam.spp_hbinterval = - params.spp_hbinterval; - if (params.spp_pathmaxrxt) - sctp_sk(sk)->paddrparam.spp_pathmaxrxt = - params.spp_pathmaxrxt; - return 0; } - trans = sctp_addr_id2transport(sk, &params.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - /* Applications can enable or disable heartbeats for any peer address - * of an association, modify an address's heartbeat interval, force a - * heartbeat to be sent immediately, and adjust the address's maximum - * number of retransmissions sent before an address is considered - * unreachable. - * - * The value of the heartbeat interval, in milliseconds.
A value of - * UINT32_MAX (4294967295), when modifying the parameter, specifies - * that a heartbeat should be sent immediately to the peer address, - * and the current interval should remain unchanged. + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. */ - if (0xffffffff == params.spp_hbinterval) { - error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); - if (error) - return error; - } else { - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(&params, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. */ - if (params.spp_hbinterval) { - trans->hb_allowed = 1; - trans->hb_interval = - msecs_to_jiffies(params.spp_hbinterval); - } else - trans->hb_allowed = 0; + if (!trans && asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + sctp_apply_peer_addr_params(&params, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - if (params.spp_pathmaxrxt) - trans->max_retrans = params.spp_pathmaxrxt; + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This option will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoint's default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get affects the specified association.
+ * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter indicates which association the + * user is performing an action upon. Note that if + * this field's value is zero then the endpoint's + * default value is changed (affecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_setsockopt_delayed_ack_time(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_assoc_value params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(&params, optval, optlen)) + return -EFAULT; + + /* Validate value parameter. */ + if (params.assoc_value > 500) + return -EINVAL; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid.
+ */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.assoc_value) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.assoc_value); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.assoc_value; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } else { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (params.assoc_value) { + trans->sackdelay = + msecs_to_jiffies(params.assoc_value); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + } + return 0; } @@ -2334,7 +2607,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl /* Update the frag_point of the existing associations. 
*/ list_for_each(pos, &(sp->ep->asocs)) { asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); } return 0; @@ -2491,6 +2764,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); + break; + case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); break; @@ -2715,8 +2992,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ - sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); - sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + sp->hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. 
@@ -2865,7 +3147,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); - status.sstat_primary.spinfo_mtu = transport->pmtu; + status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; @@ -2924,7 +3206,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); - pinfo.spinfo_mtu = transport->pmtu; + pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; @@ -3086,69 +3368,227 @@ out: * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. 
A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results.
+ * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address field is empty then all addresses + * on the association are affected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address field is empty then all addresses + * on the association are affected. Note also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); if (len != sizeof(struct sctp_paddrparams)) return -EINVAL; + if (copy_from_user(&params, optval, len)) return -EFAULT; - /* If no association id is specified retrieve the default value - * for the endpoint that will be used for all future associations + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid.
*/ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)&params.spp_address)) { - params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; - params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; - - goto done; + if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) { + trans = sctp_addr_id2transport(sk, &params.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } } - trans = sctp_addr_id2transport(sk, &params.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); return -EINVAL; + } - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. - */ - if (!trans->hb_allowed) - params.spp_hbinterval = 0; - else - params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values.
*/ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - params.spp_pathmaxrxt = trans->max_retrans; + if (copy_to_user(optval, &params, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This option will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoint's default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get affects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter indicates which association the + * user is performing an action upon. Note that if + * this field's value is zero then the endpoint's + * default value is changed (affecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay.
+ */ +static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(&params, optval, len)) + return -EFAULT; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = jiffies_to_msecs( + asoc->sackdelay); + else + params.assoc_value = 0; + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = sp->sackdelay; + else + params.assoc_value = 0; + } -done: if (copy_to_user(optval, &params, len)) return -EFAULT; @@ -4015,6 +4455,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_getsockopt_delayed_ack_time(sk, len, optval, + optlen); + break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 268ddaf2dc0f..68d73e2dd155 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->init_sent_count = 0; peer->state = SCTP_ACTIVE; - peer->hb_allowed = 0; + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + peer->hbinterval = 0; /* Initialize the default path max_retrans.
*/ - peer->max_retrans = sctp_max_retrans_path; + peer->pathmaxrxt = sctp_max_retrans_path; peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); @@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport) dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); dst_release(dst); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Caches the dst entry and source address for a transport's destination @@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport, af->get_saddr(asoc, dst, daddr, &transport->saddr); transport->dst = dst; + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport, opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Hold a reference to a transport. 
*/ @@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; - pmtu = transport->asoc->pmtu; + pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less @@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); - transport->cwnd = transport->asoc->pmtu; + 4*transport->asoc->pathmtu); + transport->cwnd = transport->asoc->pathmtu; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if ((jiffies - transport->last_time_ecne_reduced) > transport->rtt) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, */ if ((jiffies - transport->last_time_used) > transport->rto) transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); break; }; @@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, unsigned long sctp_transport_timeout(struct sctp_transport *t) { unsigned long timeout; - timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout = t->hbinterval + t->rto + sctp_jitter(t->rto); timeout += jiffies; return timeout; } diff --git a/net/socket.c b/net/socket.c index 3145103cdf54..b38a263853c3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -640,154 +640,150 @@ static 
void sock_aio_dtor(struct kiocb *iocb) kfree(iocb->private); } -/* - * Read data from a socket. ubuf is a user mode pointer. We make sure the user - * area ubuf...ubuf+size-1 is writable before asking the protocol. - */ - -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, - size_t size, loff_t pos) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { - struct sock_iocb *x, siocb; struct socket *sock; int flags; - if (pos != 0) - return -ESPIPE; - if (size==0) /* Match SYS5 behaviour */ - return 0; + sock = file->private_data; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + char __user *ubuf, size_t size, struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; iocb->ki_dtor = sock_aio_dtor; } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_iov.iov_base = ubuf; - x->async_iov.iov_len = size; - flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 
0 : MSG_DONTWAIT; + siocb->kiocb = iocb; + siocb->async_iov.iov_base = ubuf; + siocb->async_iov.iov_len = size; - return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); + iocb->private = siocb; + return siocb; } +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; -/* - * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 - * is readable by the user process. - */ + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, - size_t size, loff_t pos) + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_readv(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct sock_iocb *x, siocb; - struct socket *sock; - + struct kiocb iocb; + struct sock_iocb siocb; + struct msghdr msg; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + + ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; + if (pos != 0) return -ESPIPE; - if(size==0) /* Match SYS5 behaviour */ + if (count == 0) /* Match SYS5 behaviour */ return 0; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; - iocb->ki_dtor = sock_aio_dtor; - } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - - 
x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (sock->type == SOCK_SEQPACKET) - x->async_msg.msg_flags |= MSG_EOR; - x->async_iov.iov_base = (void __user *)ubuf; - x->async_iov.iov_len = size; - - return __sock_sendmsg(iocb, sock, &x->async_msg, size); + x = alloc_sock_iocb(iocb, ubuf, count, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) { - struct socket *sock; - int flags; + struct socket *sock = file->private_data; + size_t size = 0; + int i; - sock = file->private_data; + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; - flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (more) - flags |= MSG_MORE; + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; - return sock->ops->sendpage(sock, page, offset, size, flags); + return __sock_sendmsg(iocb, sock, msg, size); } -static int sock_readv_writev(int type, - struct file * file, const struct iovec * iov, - long count, size_t size) +static ssize_t sock_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct msghdr msg; - struct socket *sock; + struct kiocb iocb; + struct sock_iocb siocb; + int ret; - sock = file->private_data; + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = count; - msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} - /* read() does a VERIFY_WRITE */ - if (type == VERIFY_WRITE) - return sock_recvmsg(sock, &msg, size, msg.msg_flags); +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; - if (sock->type == SOCK_SEQPACKET) - msg.msg_flags |= MSG_EOR; + if (pos != 0) + return -ESPIPE; + if (count == 0) /* Match SYS5 behaviour */ + return 0; - return sock_sendmsg(sock, &msg, size); -} + x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb); + if (!x) + return -ENOMEM; -static ssize_t sock_readv(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_WRITE, - file, vector, count, tot_len); -} - -static ssize_t sock_writev(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; 
i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_READ, - file, vector, count, tot_len); + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } @@ -904,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; default: err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(cmd, argp); break; } return err; @@ -990,7 +993,7 @@ static int sock_fasync(int fd, struct file *filp, int on) if (on) { - fna=(struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); + fna = kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); if(fna==NULL) return -ENOMEM; } @@ -2036,7 +2039,7 @@ int sock_unregister(int family) return 0; } -void __init sock_init(void) +static int __init sock_init(void) { /* * Initialize sock SLAB cache. @@ -2044,12 +2047,10 @@ void __init sock_init(void) sk_init(); -#ifdef SLAB_SKB /* * Initialize skbuff SLAB cache */ skb_init(); -#endif /* * Initialize the protocols module. @@ -2058,15 +2059,19 @@ void __init sock_init(void) init_inodecache(); register_filesystem(&sock_fs_type); sock_mnt = kern_mount(&sock_fs_type); - /* The real protocol initialization is performed when - * do_initcalls is run. + + /* The real protocol initialization is performed in later initcalls. 
*/ #ifdef CONFIG_NETFILTER netfilter_init(); #endif + + return 0; } +core_initcall(sock_init); /* early initcall */ + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 8c7756036e95..9ac1b8c26c01 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -94,7 +94,7 @@ rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire) struct rpc_cred_cache *new; int i; - new = (struct rpc_cred_cache *)kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; for (i = 0; i < RPC_CREDCACHE_NR; i++) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 5f1f806a0b11..129e2bd36aff 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -97,13 +97,17 @@ get_key(const void *p, const void *end, struct crypto_tfm **res) alg_mode = CRYPTO_TFM_MODE_CBC; break; default: - dprintk("RPC: get_key: unsupported algorithm %d\n", alg); + printk("gss_kerberos_mech: unsupported algorithm %d\n", alg); goto out_err_free_key; } - if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) { + printk("gss_kerberos_mech: unable to initialize crypto algorithm %s\n", alg_name); goto out_err_free_key; - if (crypto_cipher_setkey(*res, key.data, key.len)) + } + if (crypto_cipher_setkey(*res, key.data, key.len)) { + printk("gss_kerberos_mech: error setting key for crypto algorithm %s\n", alg_name); goto out_err_free_tfm; + } kfree(key.data); return p; diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c index 39b3edc14694..58400807d4df 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -111,14 +111,18 @@ get_key(const void *p, const void *end, struct crypto_tfm **res, int *resalg) setkey = 0; break; default: - dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg); + 
dprintk("gss_spkm3_mech: unsupported algorithm %d\n", *resalg); goto out_err_free_key; } - if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) { + printk("gss_spkm3_mech: unable to initialize crypto algorthm %s\n", alg_name); goto out_err_free_key; + } if (setkey) { - if (crypto_cipher_setkey(*res, key.data, key.len)) + if (crypto_cipher_setkey(*res, key.data, key.len)) { + printk("gss_spkm3_mech: error setting key for crypto algorthm %s\n", alg_name); goto out_err_free_tfm; + } } if(key.len > 0) diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index d1e12b25d6e2..86fbf7c3e39c 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -59,7 +59,7 @@ spkm3_make_token(struct spkm3_ctx *ctx, char tokhdrbuf[25]; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf}; - int tmsglen, tokenlen = 0; + int tokenlen = 0; unsigned char *ptr; s32 now; int ctxelen = 0, ctxzbit = 0; @@ -92,24 +92,23 @@ spkm3_make_token(struct spkm3_ctx *ctx, } if (toktype == SPKM_MIC_TOK) { - tmsglen = 0; /* Calculate checksum over the mic-header */ asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit); spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data, ctxelen, ctxzbit); if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len, - text, &md5cksum)) + text, 0, &md5cksum)) goto out_err; asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit); - tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1; + tokenlen = 10 + ctxelen + 1 + md5elen + 1; /* Create token header using generic routines */ - token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen); + token->len = g_token_size(&ctx->mech_used, tokenlen); ptr = token->data; - g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr); + g_make_token_header(&ctx->mech_used, tokenlen, &ptr); spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, 
md5elen, md5zbit); } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */ diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c index 1f824578d773..af0d7ce74686 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -182,6 +182,7 @@ spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ct * *tokp points to the beginning of the SPKM_MIC token described * in rfc 2025, section 3.2.1: * + * toklen is the inner token length */ void spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit) @@ -189,7 +190,7 @@ spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hd unsigned char *ict = *tokp; *(u8 *)ict++ = 0xa4; - *(u8 *)ict++ = toklen - 2; + *(u8 *)ict++ = toklen; memcpy(ict, mic_hdr->data, mic_hdr->len); ict += mic_hdr->len; diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index 241d5b30dfcb..96851b0ba1ba 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -95,7 +95,7 @@ spkm3_read_token(struct spkm3_ctx *ctx, ret = GSS_S_DEFECTIVE_TOKEN; code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2, mic_hdrlen + 2, - message_buffer, &md5cksum); + message_buffer, 0, &md5cksum); if (code) goto out; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 890fb5ea0dcb..1b3ed4fd1987 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -70,7 +70,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", acred->uid, acred->gid); - if (!(cred = (struct unx_cred *) kmalloc(sizeof(*cred), GFP_KERNEL))) + if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL))) return ERR_PTR(-ENOMEM); atomic_set(&cred->uc_count, 1); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 
f509e9992767..dcaa0c4453ff 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -575,12 +575,11 @@ cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); up(&queue_io_sem); - if (rp->offset) - BUG(); + BUG_ON(rp->offset); return 0; } rq = container_of(rp->q.list.next, struct cache_request, q.list); - if (rq->q.reader) BUG(); + BUG_ON(rq->q.reader); if (rp->offset == 0) rq->readers++; spin_unlock(&queue_lock); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 61c3abeaccae..d2f0550c4ba0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -118,7 +118,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, goto out_err; err = -ENOMEM; - clnt = (struct rpc_clnt *) kmalloc(sizeof(*clnt), GFP_KERNEL); + clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) goto out_err; memset(clnt, 0, sizeof(*clnt)); @@ -225,7 +225,7 @@ rpc_clone_client(struct rpc_clnt *clnt) { struct rpc_clnt *new; - new = (struct rpc_clnt *)kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out_no_clnt; memcpy(new, clnt, sizeof(*new)); @@ -268,7 +268,8 @@ rpc_shutdown_client(struct rpc_clnt *clnt) clnt->cl_oneshot = 0; clnt->cl_dead = 0; rpc_killall_tasks(clnt); - sleep_on_timeout(&destroy_wait, 1*HZ); + wait_event_timeout(destroy_wait, + !atomic_read(&clnt->cl_users), 1*HZ); } if (atomic_read(&clnt->cl_users) < 0) { @@ -374,19 +375,23 @@ out: * Default callback for async RPC calls */ static void -rpc_default_callback(struct rpc_task *task) +rpc_default_callback(struct rpc_task *task, void *data) { } +static const struct rpc_call_ops rpc_default_ops = { + .rpc_call_done = rpc_default_callback, +}; + /* * Export the signal mask handling for synchronous code that * sleeps on RPC calls */ -#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL)) +#define RPC_INTR_SIGNALS (sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT) | 
sigmask(SIGTERM)) static void rpc_save_sigmask(sigset_t *oldset, int intr) { - unsigned long sigallow = 0; + unsigned long sigallow = sigmask(SIGKILL); sigset_t sigmask; /* Block all signals except those listed in sigallow */ @@ -432,7 +437,7 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) BUG_ON(flags & RPC_TASK_ASYNC); status = -ENOMEM; - task = rpc_new_task(clnt, NULL, flags); + task = rpc_new_task(clnt, flags, &rpc_default_ops, NULL); if (task == NULL) goto out; @@ -442,14 +447,15 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) rpc_call_setup(task, msg, 0); /* Set up the call info struct and execute the task */ - if (task->tk_status == 0) { + status = task->tk_status; + if (status == 0) { + atomic_inc(&task->tk_count); status = rpc_execute(task); - } else { - status = task->tk_status; - rpc_release_task(task); + if (status == 0) + status = task->tk_status; } - rpc_restore_sigmask(&oldset); + rpc_release_task(task); out: return status; } @@ -459,7 +465,7 @@ out: */ int rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, - rpc_action callback, void *data) + const struct rpc_call_ops *tk_ops, void *data) { struct rpc_task *task; sigset_t oldset; @@ -472,12 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, flags |= RPC_TASK_ASYNC; /* Create/initialize a new RPC task */ - if (!callback) - callback = rpc_default_callback; status = -ENOMEM; - if (!(task = rpc_new_task(clnt, callback, flags))) + if (!(task = rpc_new_task(clnt, flags, tk_ops, data))) goto out; - task->tk_calldata = data; /* Mask signals on GSS_AUTH upcalls */ rpc_task_sigmask(task, &oldset); @@ -511,7 +514,7 @@ rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags) if (task->tk_status == 0) task->tk_action = call_start; else - task->tk_action = NULL; + task->tk_action = rpc_exit_task; } void @@ -536,6 +539,18 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) } 
EXPORT_SYMBOL(rpc_max_payload); +/** + * rpc_force_rebind - force transport to check that remote port is unchanged + * @clnt: client to rebind + * + */ +void rpc_force_rebind(struct rpc_clnt *clnt) +{ + if (clnt->cl_autobind) + clnt->cl_port = 0; +} +EXPORT_SYMBOL(rpc_force_rebind); + /* * Restart an (async) RPC call. Usually called from within the * exit handler. @@ -642,24 +657,26 @@ call_reserveresult(struct rpc_task *task) /* * 2. Allocate the buffer. For details, see sched.c:rpc_malloc. - * (Note: buffer memory is freed in rpc_task_release). + * (Note: buffer memory is freed in xprt_release). */ static void call_allocate(struct rpc_task *task) { + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; unsigned int bufsiz; dprintk("RPC: %4d call_allocate (status %d)\n", task->tk_pid, task->tk_status); task->tk_action = call_bind; - if (task->tk_buffer) + if (req->rq_buffer) return; /* FIXME: compute buffer requirements more exactly using * auth->au_wslack */ bufsiz = task->tk_msg.rpc_proc->p_bufsiz + RPC_SLACK_SPACE; - if (rpc_malloc(task, bufsiz << 1) != NULL) + if (xprt->ops->buf_alloc(task, bufsiz << 1) != NULL) return; printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); @@ -702,14 +719,14 @@ call_encode(struct rpc_task *task) task->tk_pid, task->tk_status); /* Default buffer setup */ - bufsiz = task->tk_bufsize >> 1; - sndbuf->head[0].iov_base = (void *)task->tk_buffer; + bufsiz = req->rq_bufsize >> 1; + sndbuf->head[0].iov_base = (void *)req->rq_buffer; sndbuf->head[0].iov_len = bufsiz; sndbuf->tail[0].iov_len = 0; sndbuf->page_len = 0; sndbuf->len = 0; sndbuf->buflen = bufsiz; - rcvbuf->head[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + rcvbuf->head[0].iov_base = (void *)((char *)req->rq_buffer + bufsiz); rcvbuf->head[0].iov_len = bufsiz; rcvbuf->tail[0].iov_len = 0; rcvbuf->page_len = 0; @@ -849,8 +866,7 @@ call_connect_status(struct rpc_task *task) } /* Something failed: remote service port may 
have changed */ - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); switch (status) { case -ENOTCONN: @@ -892,7 +908,7 @@ call_transmit(struct rpc_task *task) if (task->tk_status < 0) return; if (!task->tk_msg.rpc_proc->p_decode) { - task->tk_action = NULL; + task->tk_action = rpc_exit_task; rpc_wake_up_task(task); } return; @@ -931,8 +947,7 @@ call_status(struct rpc_task *task) break; case -ECONNREFUSED: case -ENOTCONN: - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); task->tk_action = call_bind; break; case -EAGAIN: @@ -943,8 +958,7 @@ call_status(struct rpc_task *task) rpc_exit(task, status); break; default: - if (clnt->cl_chatty) - printk("%s: RPC call returned error %d\n", + printk("%s: RPC call returned error %d\n", clnt->cl_protname, -status); rpc_exit(task, status); break; @@ -979,20 +993,18 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); if (RPC_IS_SOFT(task)) { - if (clnt->cl_chatty) - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { + if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_protname, clnt->cl_server); } - if (clnt->cl_autobind) - clnt->cl_port = 0; + rpc_force_rebind(clnt); retry: clnt->cl_stats->rpcretrans++; @@ -1014,7 +1026,7 @@ call_decode(struct rpc_task *task) dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { + if (task->tk_flags & RPC_CALL_MAJORSEEN) { printk(KERN_NOTICE "%s: server %s OK\n", clnt->cl_protname, clnt->cl_server); task->tk_flags &= ~RPC_CALL_MAJORSEEN; @@ -1039,13 +1051,14 @@ call_decode(struct rpc_task *task) 
sizeof(req->rq_rcv_buf)) != 0); /* Verify the RPC header */ - if (!(p = call_verify(task))) { - if (task->tk_action == NULL) - return; - goto out_retry; + p = call_verify(task); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EAGAIN)) + goto out_retry; + return; } - task->tk_action = NULL; + task->tk_action = rpc_exit_task; if (decode) task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, @@ -1138,7 +1151,7 @@ call_verify(struct rpc_task *task) if ((n = ntohl(*p++)) != RPC_REPLY) { printk(KERN_WARNING "call_verify: not an RPC reply: %x\n", n); - goto out_retry; + goto out_garbage; } if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) { if (--len < 0) @@ -1168,7 +1181,7 @@ call_verify(struct rpc_task *task) task->tk_pid); rpcauth_invalcred(task); task->tk_action = call_refresh; - return NULL; + goto out_retry; case RPC_AUTH_BADCRED: case RPC_AUTH_BADVERF: /* possibly garbled cred/verf? */ @@ -1178,7 +1191,7 @@ call_verify(struct rpc_task *task) dprintk("RPC: %4d call_verify: retry garbled creds\n", task->tk_pid); task->tk_action = call_bind; - return NULL; + goto out_retry; case RPC_AUTH_TOOWEAK: printk(KERN_NOTICE "call_verify: server requires stronger " "authentication.\n"); @@ -1193,7 +1206,7 @@ call_verify(struct rpc_task *task) } if (!(p = rpcauth_checkverf(task, p))) { printk(KERN_WARNING "call_verify: auth check failed\n"); - goto out_retry; /* bad verifier, retry */ + goto out_garbage; /* bad verifier, retry */ } len = p - (u32 *)iov->iov_base - 1; if (len < 0) @@ -1230,23 +1243,24 @@ call_verify(struct rpc_task *task) /* Also retry */ } -out_retry: +out_garbage: task->tk_client->cl_stats->rpcgarbage++; if (task->tk_garb_retry) { task->tk_garb_retry--; dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid); task->tk_action = call_bind; - return NULL; +out_retry: + return ERR_PTR(-EAGAIN); } printk(KERN_WARNING "RPC %s: retry failed, exit EIO\n", __FUNCTION__); out_eio: error = -EIO; out_err: rpc_exit(task, error); - return NULL; + return ERR_PTR(error); 
out_overflow: printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__); - goto out_retry; + goto out_garbage; } static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj) diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index a398575f94b8..8139ce68e915 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -90,8 +90,7 @@ bailout: map->pm_binding = 0; rpc_wake_up(&map->pm_bindwait); spin_unlock(&pmap_lock); - task->tk_status = -EIO; - task->tk_action = NULL; + rpc_exit(task, -EIO); } #ifdef CONFIG_ROOT_NFS @@ -132,21 +131,22 @@ static void pmap_getport_done(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_portmap *map = clnt->cl_pmap; dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n", task->tk_pid, task->tk_status, clnt->cl_port); + + xprt->ops->set_port(xprt, 0); if (task->tk_status < 0) { /* Make the calling task exit with an error */ - task->tk_action = NULL; + task->tk_action = rpc_exit_task; } else if (clnt->cl_port == 0) { /* Program not registered */ - task->tk_status = -EACCES; - task->tk_action = NULL; + rpc_exit(task, -EACCES); } else { - /* byte-swap port number first */ + xprt->ops->set_port(xprt, clnt->cl_port); clnt->cl_port = htons(clnt->cl_port); - clnt->cl_xprt->addr.sin_port = clnt->cl_port; } spin_lock(&pmap_lock); map->pm_binding = 0; @@ -207,7 +207,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg xprt = xprt_create_proto(proto, srvaddr, NULL); if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; - xprt->addr.sin_port = htons(RPC_PMAP_PORT); + xprt->ops->set_port(xprt, RPC_PMAP_PORT); if (!privileged) xprt->resvport = 0; @@ -217,7 +217,6 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg RPC_AUTH_UNIX); if (!IS_ERR(clnt)) { clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; } return clnt; diff --git a/net/sunrpc/rpc_pipe.c 
b/net/sunrpc/rpc_pipe.c index 16a2458f38f7..9764c80ab0b2 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -69,10 +69,13 @@ rpc_timeout_upcall_queue(void *data) struct rpc_inode *rpci = (struct rpc_inode *)data; struct inode *inode = &rpci->vfs_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); + if (rpci->ops == NULL) + goto out; if (rpci->nreaders == 0 && !list_empty(&rpci->pipe)) __rpc_purge_upcall(inode, -ETIMEDOUT); - up(&inode->i_sem); +out: + mutex_unlock(&inode->i_mutex); } int @@ -81,7 +84,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) struct rpc_inode *rpci = RPC_I(inode); int res = -EPIPE; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) goto out; if (rpci->nreaders) { @@ -97,7 +100,7 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) res = 0; } out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); wake_up(&rpci->waitq); return res; } @@ -113,9 +116,7 @@ rpc_close_pipes(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); - cancel_delayed_work(&rpci->queue_timeout); - flush_scheduled_work(); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops != NULL) { rpci->nreaders = 0; __rpc_purge_list(rpci, &rpci->in_upcall, -EPIPE); @@ -126,7 +127,9 @@ rpc_close_pipes(struct inode *inode) rpci->ops = NULL; } rpc_inode_setowner(inode, NULL); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); + cancel_delayed_work(&rpci->queue_timeout); + flush_scheduled_work(); } static struct inode * @@ -151,7 +154,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) struct rpc_inode *rpci = RPC_I(inode); int res = -ENXIO; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops != NULL) { if (filp->f_mode & FMODE_READ) rpci->nreaders ++; @@ -159,17 +162,17 @@ rpc_pipe_open(struct inode *inode, struct file *filp) rpci->nwriters ++; res = 0; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } static int 
rpc_pipe_release(struct inode *inode, struct file *filp) { - struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + struct rpc_inode *rpci = RPC_I(inode); struct rpc_pipe_msg *msg; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) goto out; msg = (struct rpc_pipe_msg *)filp->private_data; @@ -187,7 +190,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return 0; } @@ -199,7 +202,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (rpci->ops == NULL) { res = -EPIPE; goto out_unlock; @@ -226,7 +229,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) rpci->ops->destroy_msg(msg); } out_unlock: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } @@ -237,11 +240,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct rpc_inode *rpci = RPC_I(inode); int res; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = -EPIPE; if (rpci->ops != NULL) res = rpci->ops->downcall(filp, buf, len); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } @@ -319,7 +322,7 @@ rpc_info_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); clnt = RPC_I(inode)->private; if (clnt) { atomic_inc(&clnt->cl_users); @@ -328,7 +331,7 @@ rpc_info_open(struct inode *inode, struct file *file) single_release(inode, file); ret = -EINVAL; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return ret; } @@ -488,11 +491,11 @@ rpc_depopulate(struct dentry *parent) struct dentry *dentry, *dvec[10]; int n = 0; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); repeat: spin_lock(&dcache_lock); list_for_each_safe(pos, 
next, &parent->d_subdirs) { - dentry = list_entry(pos, struct dentry, d_child); + dentry = list_entry(pos, struct dentry, d_u.d_child); spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { dget_locked(dentry); @@ -516,7 +519,7 @@ repeat: } while (n); goto repeat; } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); } static int @@ -529,7 +532,7 @@ rpc_populate(struct dentry *parent, struct dentry *dentry; int mode, i; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); for (i = start; i < eof; i++) { dentry = d_alloc_name(parent, files[i].name); if (!dentry) @@ -549,10 +552,10 @@ rpc_populate(struct dentry *parent, dir->i_nlink++; d_add(dentry, inode); } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return 0; out_bad: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); printk(KERN_WARNING "%s: %s failed to populate directory %s\n", __FILE__, __FUNCTION__, parent->d_name.name); return -ENOMEM; @@ -606,7 +609,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) if ((error = rpc_lookup_parent(path, nd)) != 0) return ERR_PTR(error); dir = nd->dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto out_err; @@ -617,7 +620,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) } return dentry; out_err: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(nd); return dentry; } @@ -643,7 +646,7 @@ rpc_mkdir(char *path, struct rpc_clnt *rpc_client) if (error) goto err_depopulate; out: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return dentry; err_depopulate: @@ -668,7 +671,7 @@ rpc_rmdir(char *path) if ((error = rpc_lookup_parent(path, &nd)) != 0) return error; dir = nd.dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -678,7 +681,7 @@ rpc_rmdir(char *path) error = __rpc_rmdir(dir, dentry); dput(dentry); out_release: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); 
rpc_release_path(&nd); return error; } @@ -707,7 +710,7 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) rpci->ops = ops; inode_dir_notify(dir, DN_CREATE); out: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return dentry; err_dput: @@ -729,7 +732,7 @@ rpc_unlink(char *path) if ((error = rpc_lookup_parent(path, &nd)) != 0) return error; dir = nd.dentry->d_inode; - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -743,7 +746,7 @@ rpc_unlink(char *path) dput(dentry); inode_dir_notify(dir, DN_DELETE); out_release: - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); rpc_release_path(&nd); return error; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 54e60a657500..7415406aa1ae 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -41,8 +41,6 @@ static mempool_t *rpc_buffer_mempool __read_mostly; static void __rpc_default_timer(struct rpc_task *task); static void rpciod_killall(void); -static void rpc_free(struct rpc_task *task); - static void rpc_async_schedule(void *); /* @@ -264,6 +262,35 @@ void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) } EXPORT_SYMBOL(rpc_init_wait_queue); +static int rpc_wait_bit_interruptible(void *word) +{ + if (signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +/* + * Mark an RPC call as having completed by clearing the 'active' bit + */ +static inline void rpc_mark_complete_task(struct rpc_task *task) +{ + rpc_clear_active(task); + wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE); +} + +/* + * Allow callers to wait for completion of an RPC call + */ +int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) +{ + if (action == NULL) + action = rpc_wait_bit_interruptible; + return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, + action, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(__rpc_wait_for_completion_task); + /* * Make an 
RPC task runnable. * @@ -299,10 +326,7 @@ static void rpc_make_runnable(struct rpc_task *task) static inline void rpc_schedule_run(struct rpc_task *task) { - /* Don't run a child twice! */ - if (RPC_IS_ACTIVATED(task)) - return; - task->tk_active = 1; + rpc_set_active(task); rpc_make_runnable(task); } @@ -324,8 +348,7 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, } /* Mark the task as being activated if so needed */ - if (!RPC_IS_ACTIVATED(task)) - task->tk_active = 1; + rpc_set_active(task); __rpc_add_wait_queue(q, task); @@ -555,36 +578,29 @@ __rpc_atrun(struct rpc_task *task) } /* - * Helper that calls task->tk_exit if it exists and then returns - * true if we should exit __rpc_execute. + * Helper to call task->tk_ops->rpc_call_prepare */ -static inline int __rpc_do_exit(struct rpc_task *task) +static void rpc_prepare_task(struct rpc_task *task) { - if (task->tk_exit != NULL) { - lock_kernel(); - task->tk_exit(task); - unlock_kernel(); - /* If tk_action is non-null, we should restart the call */ - if (task->tk_action != NULL) { - if (!RPC_ASSASSINATED(task)) { - /* Release RPC slot and buffer memory */ - xprt_release(task); - rpc_free(task); - return 0; - } - printk(KERN_ERR "RPC: dead task tried to walk away.\n"); - } - } - return 1; + task->tk_ops->rpc_call_prepare(task, task->tk_calldata); } -static int rpc_wait_bit_interruptible(void *word) +/* + * Helper that calls task->tk_ops->rpc_call_done if it exists + */ +void rpc_exit_task(struct rpc_task *task) { - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; + task->tk_action = NULL; + if (task->tk_ops->rpc_call_done != NULL) { + task->tk_ops->rpc_call_done(task, task->tk_calldata); + if (task->tk_action != NULL) { + WARN_ON(RPC_ASSASSINATED(task)); + /* Always release the RPC slot and buffer memory */ + xprt_release(task); + } + } } +EXPORT_SYMBOL(rpc_exit_task); /* * This is the RPC `scheduler' (or rather, the finite state machine). 
@@ -631,12 +647,11 @@ static int __rpc_execute(struct rpc_task *task) * by someone else. */ if (!RPC_IS_QUEUED(task)) { - if (task->tk_action != NULL) { - lock_kernel(); - task->tk_action(task); - unlock_kernel(); - } else if (__rpc_do_exit(task)) + if (task->tk_action == NULL) break; + lock_kernel(); + task->tk_action(task); + unlock_kernel(); } /* @@ -676,9 +691,9 @@ static int __rpc_execute(struct rpc_task *task) dprintk("RPC: %4d sync task resuming\n", task->tk_pid); } - dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); - status = task->tk_status; - + dprintk("RPC: %4d, return %d, status %d\n", task->tk_pid, status, task->tk_status); + /* Wake up anyone who is waiting for task completion */ + rpc_mark_complete_task(task); /* Release all resources associated with the task */ rpc_release_task(task); return status; @@ -696,9 +711,7 @@ static int __rpc_execute(struct rpc_task *task) int rpc_execute(struct rpc_task *task) { - BUG_ON(task->tk_active); - - task->tk_active = 1; + rpc_set_active(task); rpc_set_running(task); return __rpc_execute(task); } @@ -708,17 +721,19 @@ static void rpc_async_schedule(void *arg) __rpc_execute((struct rpc_task *)arg); } -/* - * Allocate memory for RPC purposes. +/** + * rpc_malloc - allocate an RPC buffer + * @task: RPC task that will use this buffer + * @size: requested byte size * * We try to ensure that some NFS reads and writes can always proceed * by using a mempool when allocating 'small' buffers. * In order to avoid memory starvation triggering more writebacks of * NFS requests, we use GFP_NOFS rather than GFP_KERNEL. 
*/ -void * -rpc_malloc(struct rpc_task *task, size_t size) +void * rpc_malloc(struct rpc_task *task, size_t size) { + struct rpc_rqst *req = task->tk_rqstp; gfp_t gfp; if (task->tk_flags & RPC_TASK_SWAPPER) @@ -727,42 +742,52 @@ rpc_malloc(struct rpc_task *task, size_t size) gfp = GFP_NOFS; if (size > RPC_BUFFER_MAXSIZE) { - task->tk_buffer = kmalloc(size, gfp); - if (task->tk_buffer) - task->tk_bufsize = size; + req->rq_buffer = kmalloc(size, gfp); + if (req->rq_buffer) + req->rq_bufsize = size; } else { - task->tk_buffer = mempool_alloc(rpc_buffer_mempool, gfp); - if (task->tk_buffer) - task->tk_bufsize = RPC_BUFFER_MAXSIZE; + req->rq_buffer = mempool_alloc(rpc_buffer_mempool, gfp); + if (req->rq_buffer) + req->rq_bufsize = RPC_BUFFER_MAXSIZE; } - return task->tk_buffer; + return req->rq_buffer; } -static void -rpc_free(struct rpc_task *task) +/** + * rpc_free - free buffer allocated via rpc_malloc + * @task: RPC task with a buffer to be freed + * + */ +void rpc_free(struct rpc_task *task) { - if (task->tk_buffer) { - if (task->tk_bufsize == RPC_BUFFER_MAXSIZE) - mempool_free(task->tk_buffer, rpc_buffer_mempool); + struct rpc_rqst *req = task->tk_rqstp; + + if (req->rq_buffer) { + if (req->rq_bufsize == RPC_BUFFER_MAXSIZE) + mempool_free(req->rq_buffer, rpc_buffer_mempool); else - kfree(task->tk_buffer); - task->tk_buffer = NULL; - task->tk_bufsize = 0; + kfree(req->rq_buffer); + req->rq_buffer = NULL; + req->rq_bufsize = 0; } } /* * Creation and deletion of RPC task structures */ -void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags) +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata) { memset(task, 0, sizeof(*task)); init_timer(&task->tk_timer); task->tk_timer.data = (unsigned long) task; task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; + atomic_set(&task->tk_count, 1); task->tk_client = clnt; task->tk_flags = flags; - 
task->tk_exit = callback; + task->tk_ops = tk_ops; + if (tk_ops->rpc_call_prepare != NULL) + task->tk_action = rpc_prepare_task; + task->tk_calldata = calldata; /* Initialize retry counters */ task->tk_garb_retry = 2; @@ -791,6 +816,8 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call list_add_tail(&task->tk_task, &all_tasks); spin_unlock(&rpc_sched_lock); + BUG_ON(task->tk_ops == NULL); + dprintk("RPC: %4d new task procpid %d\n", task->tk_pid, current->pid); } @@ -801,8 +828,7 @@ rpc_alloc_task(void) return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); } -static void -rpc_default_free_task(struct rpc_task *task) +static void rpc_free_task(struct rpc_task *task) { dprintk("RPC: %4d freeing task\n", task->tk_pid); mempool_free(task, rpc_task_mempool); @@ -813,8 +839,7 @@ rpc_default_free_task(struct rpc_task *task) * clean up after an allocation failure, as the client may * have specified "oneshot". */ -struct rpc_task * -rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) +struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata) { struct rpc_task *task; @@ -822,10 +847,7 @@ rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) if (!task) goto cleanup; - rpc_init_task(task, clnt, callback, flags); - - /* Replace tk_release */ - task->tk_release = rpc_default_free_task; + rpc_init_task(task, clnt, flags, tk_ops, calldata); dprintk("RPC: %4d allocated task\n", task->tk_pid); task->tk_flags |= RPC_TASK_DYNAMIC; @@ -845,11 +867,15 @@ cleanup: void rpc_release_task(struct rpc_task *task) { - dprintk("RPC: %4d release task\n", task->tk_pid); + const struct rpc_call_ops *tk_ops = task->tk_ops; + void *calldata = task->tk_calldata; #ifdef RPC_DEBUG BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); #endif + if (!atomic_dec_and_test(&task->tk_count)) + return; + dprintk("RPC: %4d release task\n", task->tk_pid); /* Remove from global task list */ 
spin_lock(&rpc_sched_lock); @@ -857,7 +883,6 @@ void rpc_release_task(struct rpc_task *task) spin_unlock(&rpc_sched_lock); BUG_ON (RPC_IS_QUEUED(task)); - task->tk_active = 0; /* Synchronously delete any running timer */ rpc_delete_timer(task); @@ -867,7 +892,6 @@ void rpc_release_task(struct rpc_task *task) xprt_release(task); if (task->tk_msg.rpc_cred) rpcauth_unbindcred(task); - rpc_free(task); if (task->tk_client) { rpc_release_client(task->tk_client); task->tk_client = NULL; @@ -876,11 +900,34 @@ void rpc_release_task(struct rpc_task *task) #ifdef RPC_DEBUG task->tk_magic = 0; #endif - if (task->tk_release) - task->tk_release(task); + if (task->tk_flags & RPC_TASK_DYNAMIC) + rpc_free_task(task); + if (tk_ops->rpc_release) + tk_ops->rpc_release(calldata); } /** + * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it + * @clnt - pointer to RPC client + * @flags - RPC flags + * @ops - RPC call ops + * @data - user call data + */ +struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags, + const struct rpc_call_ops *ops, + void *data) +{ + struct rpc_task *task; + task = rpc_new_task(clnt, flags, ops, data); + if (task == NULL) + return ERR_PTR(-ENOMEM); + atomic_inc(&task->tk_count); + rpc_execute(task); + return task; +} +EXPORT_SYMBOL(rpc_run_task); + +/** * rpc_find_parent - find the parent of a child task. 
* @child: child task * @@ -890,12 +937,11 @@ void rpc_release_task(struct rpc_task *task) * * Caller must hold childq.lock */ -static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) +static inline struct rpc_task *rpc_find_parent(struct rpc_task *child, struct rpc_task *parent) { - struct rpc_task *task, *parent; + struct rpc_task *task; struct list_head *le; - parent = (struct rpc_task *) child->tk_calldata; task_for_each(task, le, &childq.tasks[0]) if (task == parent) return parent; @@ -903,18 +949,22 @@ static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) return NULL; } -static void rpc_child_exit(struct rpc_task *child) +static void rpc_child_exit(struct rpc_task *child, void *calldata) { struct rpc_task *parent; spin_lock_bh(&childq.lock); - if ((parent = rpc_find_parent(child)) != NULL) { + if ((parent = rpc_find_parent(child, calldata)) != NULL) { parent->tk_status = child->tk_status; __rpc_wake_up_task(parent); } spin_unlock_bh(&childq.lock); } +static const struct rpc_call_ops rpc_child_ops = { + .rpc_call_done = rpc_child_exit, +}; + /* * Note: rpc_new_task releases the client after a failure. */ @@ -923,11 +973,9 @@ rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent) { struct rpc_task *task; - task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD); + task = rpc_new_task(clnt, RPC_TASK_ASYNC | RPC_TASK_CHILD, &rpc_child_ops, parent); if (!task) goto fail; - task->tk_exit = rpc_child_exit; - task->tk_calldata = parent; return task; fail: @@ -1063,7 +1111,7 @@ void rpc_show_tasks(void) return; } printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " - "-rpcwait -action- --exit--\n"); + "-rpcwait -action- ---ops--\n"); alltask_for_each(t, le, &all_tasks) { const char *rpc_waitq = "none"; @@ -1078,7 +1126,7 @@ void rpc_show_tasks(void) (t->tk_client ? 
t->tk_client->cl_prog : 0), t->tk_rqstp, t->tk_timeout, rpc_waitq, - t->tk_action, t->tk_exit); + t->tk_action, t->tk_ops); } spin_unlock(&rpc_sched_lock); } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index a03d4b600c92..9f7373203592 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -30,8 +30,6 @@ EXPORT_SYMBOL(rpc_init_task); EXPORT_SYMBOL(rpc_sleep_on); EXPORT_SYMBOL(rpc_wake_up_next); EXPORT_SYMBOL(rpc_wake_up_task); -EXPORT_SYMBOL(rpc_new_child); -EXPORT_SYMBOL(rpc_run_child); EXPORT_SYMBOL(rpciod_down); EXPORT_SYMBOL(rpciod_up); EXPORT_SYMBOL(rpc_new_task); @@ -45,7 +43,6 @@ EXPORT_SYMBOL(rpc_clone_client); EXPORT_SYMBOL(rpc_bind_new_program); EXPORT_SYMBOL(rpc_destroy_client); EXPORT_SYMBOL(rpc_shutdown_client); -EXPORT_SYMBOL(rpc_release_client); EXPORT_SYMBOL(rpc_killall_tasks); EXPORT_SYMBOL(rpc_call_sync); EXPORT_SYMBOL(rpc_call_async); @@ -120,7 +117,6 @@ EXPORT_SYMBOL(unix_domain_find); /* Generic XDR */ EXPORT_SYMBOL(xdr_encode_string); -EXPORT_SYMBOL(xdr_decode_string); EXPORT_SYMBOL(xdr_decode_string_inplace); EXPORT_SYMBOL(xdr_decode_netobj); EXPORT_SYMBOL(xdr_encode_netobj); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e4296c8b861e..b08419e1fc68 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -32,7 +32,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize) int vers; unsigned int xdrsize; - if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) + if (!(serv = kmalloc(sizeof(*serv), GFP_KERNEL))) return NULL; memset(serv, 0, sizeof(*serv)); serv->sv_name = prog->pg_name; @@ -122,8 +122,7 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) rqstp->rq_argused = 0; rqstp->rq_resused = 0; arghi = 0; - if (pages > RPCSVC_MAXPAGES) - BUG(); + BUG_ON(pages > RPCSVC_MAXPAGES); while (pages) { struct page *p = alloc_page(GFP_KERNEL); if (!p) @@ -167,8 +166,8 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) memset(rqstp, 0, sizeof(*rqstp)); 
init_waitqueue_head(&rqstp->rq_wait); - if (!(rqstp->rq_argp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) - || !(rqstp->rq_resp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) || !svc_init_buffer(rqstp, serv->sv_bufsz)) goto out_thread; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index cac2e774dd81..3e6c694bbad1 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -101,10 +101,22 @@ static void ip_map_put(struct cache_head *item, struct cache_detail *cd) } } +#if IP_HASHBITS == 8 +/* hash_long on a 64 bit machine is currently REALLY BAD for + * IP addresses in reverse-endian (i.e. on a little-endian machine). + * So use a trivial but reliable hash instead + */ +static inline int hash_ip(unsigned long ip) +{ + int hash = ip ^ (ip>>16); + return (hash ^ (hash>>8)) & 0xff; +} +#endif + static inline int ip_map_hash(struct ip_map *item) { return hash_str(item->m_class, IP_HASHBITS) ^ - hash_long((unsigned long)item->m_addr.s_addr, IP_HASHBITS); + hash_ip((unsigned long)item->m_addr.s_addr); } static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp) { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c6a51911e71e..e67613e4eb18 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_serv *serv = svsk->sk_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; - struct proto_ops *ops; + const struct proto_ops *ops; struct svc_sock *newsvsk; int err, slen; @@ -1026,7 +1026,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", svsk->sk_server->sv_name, -len); - svc_sock_received(svsk); + goto err_delete; } return len; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index aaf08cdd19f0..ca4bfa57e116 100644 --- a/net/sunrpc/xdr.c +++ 
b/net/sunrpc/xdr.c @@ -93,27 +93,6 @@ xdr_encode_string(u32 *p, const char *string) } u32 * -xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen) -{ - unsigned int len; - char *string; - - if ((len = ntohl(*p++)) > maxlen) - return NULL; - if (lenp) - *lenp = len; - if ((len % 4) != 0) { - string = (char *) p; - } else { - string = (char *) (p - 1); - memmove(string, p, len); - } - string[len] = '\0'; - *sp = string; - return p + XDR_QUADLEN(len); -} - -u32 * xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen) { unsigned int len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 6dda3860351f..8ff2c8acb223 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -119,6 +119,17 @@ out_sleep: return 0; } +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) { + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + } else + schedule_work(&xprt->task_cleanup); +} + /* * xprt_reserve_xprt_cong - serialize write access to transports * @task: task that is requesting access to the transport @@ -145,9 +156,7 @@ int xprt_reserve_xprt_cong(struct rpc_task *task) } return 1; } - smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; @@ -193,9 +202,7 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) return; out_unlock: - smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + xprt_clear_locked(xprt); } static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) @@ -222,9 +229,7 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) return; } out_unlock: - smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + 
xprt_clear_locked(xprt); } /** @@ -237,10 +242,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt->snd_task = NULL; - smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } } @@ -256,10 +258,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt->snd_task = NULL; - smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } } @@ -535,10 +534,6 @@ void xprt_connect(struct rpc_task *task) dprintk("RPC: %4d xprt_connect xprt %p %s connected\n", task->tk_pid, xprt, (xprt_connected(xprt) ? "is" : "is not")); - if (xprt->shutdown) { - task->tk_status = -EIO; - return; - } if (!xprt->addr.sin_port) { task->tk_status = -EIO; return; @@ -687,9 +682,6 @@ int xprt_prepare_transmit(struct rpc_task *task) dprintk("RPC: %4d xprt_prepare_transmit\n", task->tk_pid); - if (xprt->shutdown) - return -EIO; - spin_lock_bh(&xprt->transport_lock); if (req->rq_received && !req->rq_bytes_sent) { err = req->rq_received; @@ -814,11 +806,9 @@ void xprt_reserve(struct rpc_task *task) struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = -EIO; - if (!xprt->shutdown) { - spin_lock(&xprt->reserve_lock); - do_xprt_reserve(task); - spin_unlock(&xprt->reserve_lock); - } + spin_lock(&xprt->reserve_lock); + do_xprt_reserve(task); + spin_unlock(&xprt->reserve_lock); } static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) @@ -838,6 +828,8 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_timeout = xprt->timeout.to_initval; req->rq_task = task; req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_bufsize = 0; req->rq_xid = xprt_alloc_xid(xprt); 
req->rq_release_snd_buf = NULL; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, @@ -863,10 +855,11 @@ void xprt_release(struct rpc_task *task) if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; - if (list_empty(&xprt->recv) && !xprt->shutdown) + if (list_empty(&xprt->recv)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); + xprt->ops->buf_free(task); task->tk_rqstp = NULL; if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); @@ -974,16 +967,6 @@ struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rp return xprt; } -static void xprt_shutdown(struct rpc_xprt *xprt) -{ - xprt->shutdown = 1; - rpc_wake_up(&xprt->sending); - rpc_wake_up(&xprt->resend); - xprt_wake_pending_tasks(xprt, -EIO); - rpc_wake_up(&xprt->backlog); - del_timer_sync(&xprt->timer); -} - /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy @@ -992,7 +975,8 @@ static void xprt_shutdown(struct rpc_xprt *xprt) int xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); - xprt_shutdown(xprt); + xprt->shutdown = 1; + del_timer_sync(&xprt->timer); xprt->ops->destroy(xprt); kfree(xprt); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 77e8800d4127..c458f8d1d6d1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -28,6 +28,7 @@ #include <linux/udp.h> #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/sched.h> #include <linux/file.h> #include <net/sock.h> @@ -424,7 +425,7 @@ static void xs_close(struct rpc_xprt *xprt) struct sock *sk = xprt->inet; if (!sk) - return; + goto clear_close_wait; dprintk("RPC: xs_close xprt %p\n", xprt); @@ -441,6 +442,10 @@ static void xs_close(struct rpc_xprt *xprt) sk->sk_no_check = 0; sock_release(sock); +clear_close_wait: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + 
smp_mb__after_clear_bit(); } /** @@ -800,9 +805,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_SYN_SENT: case TCP_SYN_RECV: break; + case TCP_CLOSE_WAIT: + /* Try to schedule an autoclose RPC calls */ + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + schedule_work(&xprt->task_cleanup); default: xprt_disconnect(xprt); - break; } out: read_unlock(&sk->sk_callback_lock); @@ -920,6 +929,18 @@ static void xs_udp_timer(struct rpc_task *task) xprt_adjust_cwnd(task, -ETIMEDOUT); } +/** + * xs_set_port - reset the port number in the remote endpoint address + * @xprt: generic transport + * @port: new port number + * + */ +static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) +{ + dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); + xprt->addr.sin_port = htons(port); +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -1160,7 +1181,10 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, + .set_port = xs_set_port, .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .timer = xs_udp_timer, @@ -1172,7 +1196,10 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, + .set_port = xs_set_port, .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig new file mode 100644 index 000000000000..05ab18e62dee --- /dev/null +++ b/net/tipc/Kconfig @@ -0,0 +1,112 @@ +# +# TIPC configuration +# + +menu "TIPC Configuration (EXPERIMENTAL)" + depends on 
INET && EXPERIMENTAL + +config TIPC + tristate "The TIPC Protocol (EXPERIMENTAL)" + ---help--- + TBD. + + This protocol support is also available as a module ( = code which + can be inserted in and removed from the running kernel whenever you + want). The module will be called tipc. If you want to compile it + as a module, say M here and read <file:Documentation/modules.txt>. + + If in doubt, say N. + +config TIPC_ADVANCED + bool "TIPC: Advanced configuration" + depends on TIPC + default n + help + Saying Y here will open some advanced configuration + for TIPC. Most users do not need to bother, so if + unsure, just say N. + +config TIPC_ZONES + int "Maximum number of zones in network" + depends on TIPC && TIPC_ADVANCED + default "3" + help + Max number of zones inside TIPC network. Max supported value + is 255 zones, minimum is 1. + + Default is 3 zones in a network; setting this to a higher + value allows more zones but might use more memory. + +config TIPC_CLUSTERS + int "Maximum number of clusters in a zone" + depends on TIPC && TIPC_ADVANCED + default "1" + help + ***Only 1 (one cluster in a zone) is supported by current code. + Any value set here will be overridden.*** + + (Max number of clusters inside TIPC zone. Max supported + value is 4095 clusters, minimum is 1. + + Default is 1; setting this to a smaller value might save + some memory, setting it to a higher value + allows more clusters and might consume more memory.) + +config TIPC_NODES + int "Maximum number of nodes in cluster" + depends on TIPC && TIPC_ADVANCED + default "255" + help + Maximum number of nodes inside a TIPC cluster. Maximum + supported value is 2047 nodes, minimum is 8. + + Setting this to a smaller value saves some memory, + setting it to a higher value allows more nodes. 
+ +config TIPC_SLAVE_NODES + int "Maximum number of slave nodes in cluster" + depends on TIPC && TIPC_ADVANCED + default "0" + help + ***This capability is not supported by current code.*** + + Maximum number of slave nodes inside a TIPC cluster. Maximum + supported value is 2047 nodes, minimum is 0. + + Setting this to a smaller value saves some memory, + setting it to a higher value allows more nodes. + +config TIPC_PORTS + int "Maximum number of ports in a node" + depends on TIPC && TIPC_ADVANCED + default "8191" + help + Maximum number of ports within a node. Maximum + supported value is 64535 ports, minimum is 127. + + Setting this to a smaller value saves some memory, + setting it to a higher value allows more ports. + +config TIPC_LOG + int "Size of log buffer" + depends on TIPC && TIPC_ADVANCED + default 0 + help + Size (in bytes) of TIPC's internal log buffer, which records the + occurrence of significant events. Maximum supported value + is 32768 bytes, minimum is 0. + + There is no need to enable the log buffer unless the node will be + managed remotely via TIPC. + +config TIPC_DEBUG + bool "Enable debugging support" + depends on TIPC + default n + help + This will enable debugging of TIPC. + + Only say Y here if you are having trouble with TIPC. It will + enable the display of detailed information about what is going on. 
+ +endmenu diff --git a/net/tipc/Makefile b/net/tipc/Makefile new file mode 100644 index 000000000000..dceb7027946c --- /dev/null +++ b/net/tipc/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux TIPC layer +# + +obj-$(CONFIG_TIPC) := tipc.o + +tipc-y += addr.o bcast.o bearer.o config.o cluster.o \ + core.o handler.o link.o discover.o msg.o \ + name_distr.o subscr.o name_table.o net.o \ + netlink.o node.o node_subscr.o port.o ref.o \ + socket.o user_reg.o zone.o dbg.o eth_media.o + +# End of file diff --git a/net/tipc/addr.c b/net/tipc/addr.c new file mode 100644 index 000000000000..eca22260c98c --- /dev/null +++ b/net/tipc/addr.c @@ -0,0 +1,94 @@ +/* + * net/tipc/addr.c: TIPC address utility routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "dbg.h" +#include "addr.h" +#include "zone.h" +#include "cluster.h" +#include "net.h" + +u32 tipc_get_addr(void) +{ + return tipc_own_addr; +} + +/** + * addr_domain_valid - validates a network domain address + * + * Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>, + * where Z, C, and N are non-zero and do not exceed the configured limits. + * + * Returns 1 if domain address is valid, otherwise 0 + */ + +int addr_domain_valid(u32 addr) +{ + u32 n = tipc_node(addr); + u32 c = tipc_cluster(addr); + u32 z = tipc_zone(addr); + u32 max_nodes = tipc_max_nodes; + + if (is_slave(addr)) + max_nodes = LOWEST_SLAVE + tipc_max_slaves; + if (n > max_nodes) + return 0; + if (c > tipc_max_clusters) + return 0; + if (z > tipc_max_zones) + return 0; + + if (n && (!z || !c)) + return 0; + if (c && !z) + return 0; + return 1; +} + +/** + * addr_node_valid - validates a proposed network address for this node + * + * Accepts <Z.C.N>, where Z, C, and N are non-zero and do not exceed + * the configured limits. 
+ * + * Returns 1 if address can be used, otherwise 0 + */ + +int addr_node_valid(u32 addr) +{ + return (addr_domain_valid(addr) && tipc_node(addr)); +} + diff --git a/net/tipc/addr.h b/net/tipc/addr.h new file mode 100644 index 000000000000..02ca71783e2e --- /dev/null +++ b/net/tipc/addr.h @@ -0,0 +1,128 @@ +/* + * net/tipc/addr.h: Include file for TIPC address utility routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_ADDR_H +#define _TIPC_ADDR_H + +static inline u32 own_node(void) +{ + return tipc_node(tipc_own_addr); +} + +static inline u32 own_cluster(void) +{ + return tipc_cluster(tipc_own_addr); +} + +static inline u32 own_zone(void) +{ + return tipc_zone(tipc_own_addr); +} + +static inline int in_own_cluster(u32 addr) +{ + return !((addr ^ tipc_own_addr) >> 12); +} + +static inline int in_own_zone(u32 addr) +{ + return !((addr ^ tipc_own_addr) >> 24); +} + +static inline int is_slave(u32 addr) +{ + return addr & 0x800; +} + +static inline int may_route(u32 addr) +{ + return(addr ^ tipc_own_addr) >> 11; +} + +static inline int in_scope(u32 domain, u32 addr) +{ + if (!domain || (domain == addr)) + return 1; + if (domain == (addr & 0xfffff000u)) /* domain <Z.C.0> */ + return 1; + if (domain == (addr & 0xff000000u)) /* domain <Z.0.0> */ + return 1; + return 0; +} + +/** + * addr_scope - convert message lookup domain to equivalent 2-bit scope value + */ + +static inline int addr_scope(u32 domain) +{ + if (likely(!domain)) + return TIPC_ZONE_SCOPE; + if (tipc_node(domain)) + return TIPC_NODE_SCOPE; + if (tipc_cluster(domain)) + return TIPC_CLUSTER_SCOPE; + return TIPC_ZONE_SCOPE; +} + +/** + * addr_domain - convert 2-bit scope value to equivalent message lookup domain + * + * Needed when address of a named message must be looked up a second time + * after a network hop. 
+ */ + +static inline int addr_domain(int sc) +{ + if (likely(sc == TIPC_NODE_SCOPE)) + return tipc_own_addr; + if (sc == TIPC_CLUSTER_SCOPE) + return tipc_addr(tipc_zone(tipc_own_addr), + tipc_cluster(tipc_own_addr), 0); + return tipc_addr(tipc_zone(tipc_own_addr), 0, 0); +} + +static inline char *addr_string_fill(char *string, u32 addr) +{ + snprintf(string, 16, "<%u.%u.%u>", + tipc_zone(addr), tipc_cluster(addr), tipc_node(addr)); + return string; +} + +int addr_domain_valid(u32); +int addr_node_valid(u32 addr); + +#endif diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c new file mode 100644 index 000000000000..9713d622efb8 --- /dev/null +++ b/net/tipc/bcast.c @@ -0,0 +1,806 @@ +/* + * net/tipc/bcast.c: TIPC broadcast code + * + * Copyright (c) 2004-2006, Ericsson AB + * Copyright (c) 2004, Intel Corporation. + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "msg.h" +#include "dbg.h" +#include "link.h" +#include "net.h" +#include "node.h" +#include "port.h" +#include "addr.h" +#include "node_subscr.h" +#include "name_distr.h" +#include "bearer.h" +#include "name_table.h" +#include "bcast.h" + + +#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ + +#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ + +#define BCLINK_LOG_BUF_SIZE 0 + +/** + * struct bcbearer_pair - a pair of bearers used by broadcast link + * @primary: pointer to primary bearer + * @secondary: pointer to secondary bearer + * + * Bearers must have same priority and same set of reachable destinations + * to be paired. 
+ */ + +struct bcbearer_pair { + struct bearer *primary; + struct bearer *secondary; +}; + +/** + * struct bcbearer - bearer used by broadcast link + * @bearer: (non-standard) broadcast bearer structure + * @media: (non-standard) broadcast media structure + * @bpairs: array of bearer pairs + * @bpairs_temp: array of bearer pairs used during creation of "bpairs" + */ + +struct bcbearer { + struct bearer bearer; + struct media media; + struct bcbearer_pair bpairs[MAX_BEARERS]; + struct bcbearer_pair bpairs_temp[TIPC_NUM_LINK_PRI]; +}; + +/** + * struct bclink - link used for broadcast messages + * @link: (non-standard) broadcast link structure + * @node: (non-standard) node structure representing b'cast link's peer node + * + * Handles sequence numbering, fragmentation, bundling, etc. + */ + +struct bclink { + struct link link; + struct node node; +}; + + +static struct bcbearer *bcbearer = NULL; +static struct bclink *bclink = NULL; +static struct link *bcl = NULL; +static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED; + +char bc_link_name[] = "multicast-link"; + + +static inline u32 buf_seqno(struct sk_buff *buf) +{ + return msg_seqno(buf_msg(buf)); +} + +static inline u32 bcbuf_acks(struct sk_buff *buf) +{ + return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle; +} + +static inline void bcbuf_set_acks(struct sk_buff *buf, u32 acks) +{ + TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks; +} + +static inline void bcbuf_decr_acks(struct sk_buff *buf) +{ + bcbuf_set_acks(buf, bcbuf_acks(buf) - 1); +} + + +/** + * bclink_set_gap - set gap according to contents of current deferred pkt queue + * + * Called with 'node' locked, bc_lock unlocked + */ + +static inline void bclink_set_gap(struct node *n_ptr) +{ + struct sk_buff *buf = n_ptr->bclink.deferred_head; + + n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = + mod(n_ptr->bclink.last_in); + if (unlikely(buf != NULL)) + n_ptr->bclink.gap_to = mod(buf_seqno(buf) - 1); +} + +/** + * bclink_ack_allowed - test if ACK or NACK 
message can be sent at this moment + * + * This mechanism endeavours to prevent all nodes in network from trying + * to ACK or NACK at the same time. + * + * Note: TIPC uses a different trigger to distribute ACKs than it does to + * distribute NACKs, but tries to use the same spacing (divide by 16). + */ + +static inline int bclink_ack_allowed(u32 n) +{ + return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag); +} + + +/** + * bclink_retransmit_pkt - retransmit broadcast packets + * @after: sequence number of last packet to *not* retransmit + * @to: sequence number of last packet to retransmit + * + * Called with 'node' locked, bc_lock unlocked + */ + +static void bclink_retransmit_pkt(u32 after, u32 to) +{ + struct sk_buff *buf; + + spin_lock_bh(&bc_lock); + buf = bcl->first_out; + while (buf && less_eq(buf_seqno(buf), after)) { + buf = buf->next; + } + if (buf != NULL) + link_retransmit(bcl, buf, mod(to - after)); + spin_unlock_bh(&bc_lock); +} + +/** + * bclink_acknowledge - handle acknowledgement of broadcast packets + * @n_ptr: node that sent acknowledgement info + * @acked: broadcast sequence # that has been acknowledged + * + * Node is locked, bc_lock unlocked. 
+ */ + +void bclink_acknowledge(struct node *n_ptr, u32 acked) +{ + struct sk_buff *crs; + struct sk_buff *next; + unsigned int released = 0; + + if (less_eq(acked, n_ptr->bclink.acked)) + return; + + spin_lock_bh(&bc_lock); + + /* Skip over packets that node has previously acknowledged */ + + crs = bcl->first_out; + while (crs && less_eq(buf_seqno(crs), n_ptr->bclink.acked)) { + crs = crs->next; + } + + /* Update packets that node is now acknowledging */ + + while (crs && less_eq(buf_seqno(crs), acked)) { + next = crs->next; + bcbuf_decr_acks(crs); + if (bcbuf_acks(crs) == 0) { + bcl->first_out = next; + bcl->out_queue_size--; + buf_discard(crs); + released = 1; + } + crs = next; + } + n_ptr->bclink.acked = acked; + + /* Try resolving broadcast link congestion, if necessary */ + + if (unlikely(bcl->next_out)) + link_push_queue(bcl); + if (unlikely(released && !list_empty(&bcl->waiting_ports))) + link_wakeup_ports(bcl, 0); + spin_unlock_bh(&bc_lock); +} + +/** + * bclink_send_ack - unicast an ACK msg + * + * net_lock and node lock set + */ + +static void bclink_send_ack(struct node *n_ptr) +{ + struct link *l_ptr = n_ptr->active_links[n_ptr->addr & 1]; + + if (l_ptr != NULL) + link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); +} + +/** + * bclink_send_nack - broadcast a NACK msg + * + * net_lock and node lock set + */ + +static void bclink_send_nack(struct node *n_ptr) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + + if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to)) + return; + + buf = buf_acquire(INT_H_SIZE); + if (buf) { + msg = buf_msg(buf); + msg_init(msg, BCAST_PROTOCOL, STATE_MSG, + TIPC_OK, INT_H_SIZE, n_ptr->addr); + msg_set_mc_netid(msg, tipc_net_id); + msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in)); + msg_set_bcgap_after(msg, n_ptr->bclink.gap_after); + msg_set_bcgap_to(msg, n_ptr->bclink.gap_to); + msg_set_bcast_tag(msg, tipc_own_tag); + + if (bearer_send(&bcbearer->bearer, buf, 0)) { + bcl->stats.sent_nacks++; + buf_discard(buf); + } 
else { + bearer_schedule(bcl->b_ptr, bcl); + bcl->proto_msg_queue = buf; + bcl->stats.bearer_congs++; + } + + /* + * Ensure we don't send another NACK msg to the node + * until 16 more deferred messages arrive from it + * (i.e. helps prevent all nodes from NACK'ing at same time) + */ + + n_ptr->bclink.nack_sync = tipc_own_tag; + } +} + +/** + * bclink_check_gap - send a NACK if a sequence gap exists + * + * net_lock and node lock set + */ + +void bclink_check_gap(struct node *n_ptr, u32 last_sent) +{ + if (!n_ptr->bclink.supported || + less_eq(last_sent, mod(n_ptr->bclink.last_in))) + return; + + bclink_set_gap(n_ptr); + if (n_ptr->bclink.gap_after == n_ptr->bclink.gap_to) + n_ptr->bclink.gap_to = last_sent; + bclink_send_nack(n_ptr); +} + +/** + * bclink_peek_nack - process a NACK msg meant for another node + * + * Only net_lock set. + */ + +void bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 gap_to) +{ + struct node *n_ptr = node_find(dest); + u32 my_after, my_to; + + if (unlikely(!n_ptr || !node_is_up(n_ptr))) + return; + node_lock(n_ptr); + /* + * Modify gap to suppress unnecessary NACKs from this node + */ + my_after = n_ptr->bclink.gap_after; + my_to = n_ptr->bclink.gap_to; + + if (less_eq(gap_after, my_after)) { + if (less(my_after, gap_to) && less(gap_to, my_to)) + n_ptr->bclink.gap_after = gap_to; + else if (less_eq(my_to, gap_to)) + n_ptr->bclink.gap_to = n_ptr->bclink.gap_after; + } else if (less_eq(gap_after, my_to)) { + if (less_eq(my_to, gap_to)) + n_ptr->bclink.gap_to = gap_after; + } else { + /* + * Expand gap if missing bufs not in deferred queue: + */ + struct sk_buff *buf = n_ptr->bclink.deferred_head; + u32 prev = n_ptr->bclink.gap_to; + + for (; buf; buf = buf->next) { + u32 seqno = buf_seqno(buf); + + if (mod(seqno - prev) != 1) + buf = NULL; + if (seqno == gap_after) + break; + prev = seqno; + } + if (buf == NULL) + n_ptr->bclink.gap_to = gap_after; + } + /* + * Some nodes may send a complementary NACK now: + */ + if 
(bclink_ack_allowed(sender_tag + 1)) { + if (n_ptr->bclink.gap_to != n_ptr->bclink.gap_after) { + bclink_send_nack(n_ptr); + bclink_set_gap(n_ptr); + } + } + node_unlock(n_ptr); +} + +/** + * bclink_send_msg - broadcast a packet to all nodes in cluster + */ + +int bclink_send_msg(struct sk_buff *buf) +{ + int res; + + spin_lock_bh(&bc_lock); + + res = link_send_buf(bcl, buf); + if (unlikely(res == -ELINKCONG)) + buf_discard(buf); + else + bcl->stats.sent_info++; + + if (bcl->out_queue_size > bcl->stats.max_queue_sz) + bcl->stats.max_queue_sz = bcl->out_queue_size; + bcl->stats.queue_sz_counts++; + bcl->stats.accu_queue_sz += bcl->out_queue_size; + + spin_unlock_bh(&bc_lock); + return res; +} + +/** + * bclink_recv_pkt - receive a broadcast packet, and deliver upwards + * + * net_lock is read_locked, no other locks set + */ + +void bclink_recv_pkt(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct node* node = node_find(msg_prevnode(msg)); + u32 next_in; + u32 seqno; + struct sk_buff *deferred; + + msg_dbg(msg, "<BC<<<"); + + if (unlikely(!node || !node_is_up(node) || !node->bclink.supported || + (msg_mc_netid(msg) != tipc_net_id))) { + buf_discard(buf); + return; + } + + if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) { + msg_dbg(msg, "<BCNACK<<<"); + if (msg_destnode(msg) == tipc_own_addr) { + node_lock(node); + bclink_acknowledge(node, msg_bcast_ack(msg)); + node_unlock(node); + bcl->stats.recv_nacks++; + bclink_retransmit_pkt(msg_bcgap_after(msg), + msg_bcgap_to(msg)); + } else { + bclink_peek_nack(msg_destnode(msg), + msg_bcast_tag(msg), + msg_bcgap_after(msg), + msg_bcgap_to(msg)); + } + buf_discard(buf); + return; + } + + node_lock(node); +receive: + deferred = node->bclink.deferred_head; + next_in = mod(node->bclink.last_in + 1); + seqno = msg_seqno(msg); + + if (likely(seqno == next_in)) { + bcl->stats.recv_info++; + node->bclink.last_in++; + bclink_set_gap(node); + if (unlikely(bclink_ack_allowed(seqno))) { + bclink_send_ack(node); + 
bcl->stats.sent_acks++; + } + if (likely(msg_isdata(msg))) { + node_unlock(node); + port_recv_mcast(buf, NULL); + } else if (msg_user(msg) == MSG_BUNDLER) { + bcl->stats.recv_bundles++; + bcl->stats.recv_bundled += msg_msgcnt(msg); + node_unlock(node); + link_recv_bundle(buf); + } else if (msg_user(msg) == MSG_FRAGMENTER) { + bcl->stats.recv_fragments++; + if (link_recv_fragment(&node->bclink.defragm, + &buf, &msg)) + bcl->stats.recv_fragmented++; + node_unlock(node); + net_route_msg(buf); + } else { + node_unlock(node); + net_route_msg(buf); + } + if (deferred && (buf_seqno(deferred) == mod(next_in + 1))) { + node_lock(node); + buf = deferred; + msg = buf_msg(buf); + node->bclink.deferred_head = deferred->next; + goto receive; + } + return; + } else if (less(next_in, seqno)) { + u32 gap_after = node->bclink.gap_after; + u32 gap_to = node->bclink.gap_to; + + if (link_defer_pkt(&node->bclink.deferred_head, + &node->bclink.deferred_tail, + buf)) { + node->bclink.nack_sync++; + bcl->stats.deferred_recv++; + if (seqno == mod(gap_after + 1)) + node->bclink.gap_after = seqno; + else if (less(gap_after, seqno) && less(seqno, gap_to)) + node->bclink.gap_to = seqno; + } + if (bclink_ack_allowed(node->bclink.nack_sync)) { + if (gap_to != gap_after) + bclink_send_nack(node); + bclink_set_gap(node); + } + } else { + bcl->stats.duplicates++; + buf_discard(buf); + } + node_unlock(node); +} + +u32 bclink_get_last_sent(void) +{ + u32 last_sent = mod(bcl->next_out_no - 1); + + if (bcl->next_out) + last_sent = mod(buf_seqno(bcl->next_out) - 1); + return last_sent; +} + +u32 bclink_acks_missing(struct node *n_ptr) +{ + return (n_ptr->bclink.supported && + (bclink_get_last_sent() != n_ptr->bclink.acked)); +} + + +/** + * bcbearer_send - send a packet through the broadcast pseudo-bearer + * + * Send through as many bearers as necessary to reach all nodes + * that support TIPC multicasting. 
+ * + * Returns 0 if packet sent successfully, non-zero if not + */ + +int bcbearer_send(struct sk_buff *buf, + struct tipc_bearer *unused1, + struct tipc_media_addr *unused2) +{ + static int send_count = 0; + + struct node_map remains; + struct node_map remains_new; + int bp_index; + int swap_time; + + /* Prepare buffer for broadcasting (if first time trying to send it) */ + + if (likely(!msg_non_seq(buf_msg(buf)))) { + struct tipc_msg *msg; + + assert(cluster_bcast_nodes.count != 0); + bcbuf_set_acks(buf, cluster_bcast_nodes.count); + msg = buf_msg(buf); + msg_set_non_seq(msg); + msg_set_mc_netid(msg, tipc_net_id); + } + + /* Determine if bearer pairs should be swapped following this attempt */ + + if ((swap_time = (++send_count >= 10))) + send_count = 0; + + /* Send buffer over bearers until all targets reached */ + + remains = cluster_bcast_nodes; + + for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { + struct bearer *p = bcbearer->bpairs[bp_index].primary; + struct bearer *s = bcbearer->bpairs[bp_index].secondary; + + if (!p) + break; /* no more bearers to try */ + + nmap_diff(&remains, &p->nodes, &remains_new); + if (remains_new.count == remains.count) + continue; /* bearer pair doesn't add anything */ + + if (!p->publ.blocked && + !p->media->send_msg(buf, &p->publ, &p->media->bcast_addr)) { + if (swap_time && s && !s->publ.blocked) + goto swap; + else + goto update; + } + + if (!s || s->publ.blocked || + s->media->send_msg(buf, &s->publ, &s->media->bcast_addr)) + continue; /* unable to send using bearer pair */ +swap: + bcbearer->bpairs[bp_index].primary = s; + bcbearer->bpairs[bp_index].secondary = p; +update: + if (remains_new.count == 0) + return TIPC_OK; + + remains = remains_new; + } + + /* Unable to reach all targets */ + + bcbearer->bearer.publ.blocked = 1; + bcl->stats.bearer_congs++; + return ~TIPC_OK; +} + +/** + * bcbearer_sort - create sets of bearer pairs used by broadcast bearer + */ + +void bcbearer_sort(void) +{ + struct bcbearer_pair 
*bp_temp = bcbearer->bpairs_temp; + struct bcbearer_pair *bp_curr; + int b_index; + int pri; + + spin_lock_bh(&bc_lock); + + /* Group bearers by priority (can assume max of two per priority) */ + + memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp)); + + for (b_index = 0; b_index < MAX_BEARERS; b_index++) { + struct bearer *b = &bearers[b_index]; + + if (!b->active || !b->nodes.count) + continue; + + if (!bp_temp[b->priority].primary) + bp_temp[b->priority].primary = b; + else + bp_temp[b->priority].secondary = b; + } + + /* Create array of bearer pairs for broadcasting */ + + bp_curr = bcbearer->bpairs; + memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs)); + + for (pri = (TIPC_NUM_LINK_PRI - 1); pri >= 0; pri--) { + + if (!bp_temp[pri].primary) + continue; + + bp_curr->primary = bp_temp[pri].primary; + + if (bp_temp[pri].secondary) { + if (nmap_equal(&bp_temp[pri].primary->nodes, + &bp_temp[pri].secondary->nodes)) { + bp_curr->secondary = bp_temp[pri].secondary; + } else { + bp_curr++; + bp_curr->primary = bp_temp[pri].secondary; + } + } + + bp_curr++; + } + + spin_unlock_bh(&bc_lock); +} + +/** + * bcbearer_push - resolve bearer congestion + * + * Forces bclink to push out any unsent packets, until all packets are gone + * or congestion reoccurs. 
+ * No locks set when function called + */ + +void bcbearer_push(void) +{ + struct bearer *b_ptr; + + spin_lock_bh(&bc_lock); + b_ptr = &bcbearer->bearer; + if (b_ptr->publ.blocked) { + b_ptr->publ.blocked = 0; + bearer_lock_push(b_ptr); + } + spin_unlock_bh(&bc_lock); +} + + +int bclink_stats(char *buf, const u32 buf_size) +{ + struct print_buf pb; + + if (!bcl) + return 0; + + printbuf_init(&pb, buf, buf_size); + + spin_lock_bh(&bc_lock); + + tipc_printf(&pb, "Link <%s>\n" + " Window:%u packets\n", + bcl->name, bcl->queue_limit[0]); + tipc_printf(&pb, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", + bcl->stats.recv_info, + bcl->stats.recv_fragments, + bcl->stats.recv_fragmented, + bcl->stats.recv_bundles, + bcl->stats.recv_bundled); + tipc_printf(&pb, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", + bcl->stats.sent_info, + bcl->stats.sent_fragments, + bcl->stats.sent_fragmented, + bcl->stats.sent_bundles, + bcl->stats.sent_bundled); + tipc_printf(&pb, " RX naks:%u defs:%u dups:%u\n", + bcl->stats.recv_nacks, + bcl->stats.deferred_recv, + bcl->stats.duplicates); + tipc_printf(&pb, " TX naks:%u acks:%u dups:%u\n", + bcl->stats.sent_nacks, + bcl->stats.sent_acks, + bcl->stats.retransmitted); + tipc_printf(&pb, " Congestion bearer:%u link:%u Send queue max:%u avg:%u\n", + bcl->stats.bearer_congs, + bcl->stats.link_congs, + bcl->stats.max_queue_sz, + bcl->stats.queue_sz_counts + ? 
(bcl->stats.accu_queue_sz / bcl->stats.queue_sz_counts) + : 0); + + spin_unlock_bh(&bc_lock); + return printbuf_validate(&pb); +} + +int bclink_reset_stats(void) +{ + if (!bcl) + return -ENOPROTOOPT; + + spin_lock_bh(&bc_lock); + memset(&bcl->stats, 0, sizeof(bcl->stats)); + spin_unlock_bh(&bc_lock); + return TIPC_OK; +} + +int bclink_set_queue_limits(u32 limit) +{ + if (!bcl) + return -ENOPROTOOPT; + if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) + return -EINVAL; + + spin_lock_bh(&bc_lock); + link_set_queue_limits(bcl, limit); + spin_unlock_bh(&bc_lock); + return TIPC_OK; +} + +int bclink_init(void) +{ + bcbearer = kmalloc(sizeof(*bcbearer), GFP_ATOMIC); + bclink = kmalloc(sizeof(*bclink), GFP_ATOMIC); + if (!bcbearer || !bclink) { + nomem: + warn("Memory squeeze; Failed to create multicast link\n"); + kfree(bcbearer); + bcbearer = NULL; + kfree(bclink); + bclink = NULL; + return -ENOMEM; + } + + memset(bcbearer, 0, sizeof(struct bcbearer)); + INIT_LIST_HEAD(&bcbearer->bearer.cong_links); + bcbearer->bearer.media = &bcbearer->media; + bcbearer->media.send_msg = bcbearer_send; + sprintf(bcbearer->media.name, "tipc-multicast"); + + bcl = &bclink->link; + memset(bclink, 0, sizeof(struct bclink)); + INIT_LIST_HEAD(&bcl->waiting_ports); + bcl->next_out_no = 1; + bclink->node.lock = SPIN_LOCK_UNLOCKED; + bcl->owner = &bclink->node; + bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; + link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); + bcl->b_ptr = &bcbearer->bearer; + bcl->state = WORKING_WORKING; + sprintf(bcl->name, bc_link_name); + + if (BCLINK_LOG_BUF_SIZE) { + char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC); + + if (!pb) + goto nomem; + printbuf_init(&bcl->print_buf, pb, BCLINK_LOG_BUF_SIZE); + } + + return TIPC_OK; +} + +void bclink_stop(void) +{ + spin_lock_bh(&bc_lock); + if (bcbearer) { + link_stop(bcl); + if (BCLINK_LOG_BUF_SIZE) + kfree(bcl->print_buf.buf); + bcl = NULL; + kfree(bclink); + bclink = NULL; + kfree(bcbearer); + bcbearer = NULL; + } + 
spin_unlock_bh(&bc_lock); +} + diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h new file mode 100644 index 000000000000..5430e524b4f9 --- /dev/null +++ b/net/tipc/bcast.h @@ -0,0 +1,223 @@ +/* + * net/tipc/bcast.h: Include file for TIPC broadcast code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_BCAST_H +#define _TIPC_BCAST_H + +#define MAX_NODES 4096 +#define WSIZE 32 + +/** + * struct node_map - set of node identifiers + * @count: # of nodes in set + * @map: bitmap of node identifiers that are in the set + */ + +struct node_map { + u32 count; + u32 map[MAX_NODES / WSIZE]; +}; + + +#define PLSIZE 32 + +/** + * struct port_list - set of node local destination ports + * @count: # of ports in set (only valid for first entry in list) + * @next: pointer to next entry in list + * @ports: array of port references + */ + +struct port_list { + int count; + struct port_list *next; + u32 ports[PLSIZE]; +}; + + +struct node; + +extern char bc_link_name[]; + + +/** + * nmap_get - determine if node exists in a node map + */ + +static inline int nmap_get(struct node_map *nm_ptr, u32 node) +{ + int n = tipc_node(node); + int w = n / WSIZE; + int b = n % WSIZE; + + return nm_ptr->map[w] & (1 << b); +} + +/** + * nmap_add - add a node to a node map + */ + +static inline void nmap_add(struct node_map *nm_ptr, u32 node) +{ + int n = tipc_node(node); + int w = n / WSIZE; + u32 mask = (1 << (n % WSIZE)); + + if ((nm_ptr->map[w] & mask) == 0) { + nm_ptr->count++; + nm_ptr->map[w] |= mask; + } +} + +/** + * nmap_remove - remove a node from a node map + */ + +static inline void nmap_remove(struct node_map *nm_ptr, u32 node) +{ + int n = tipc_node(node); + int w = n / WSIZE; + u32 mask = (1 << (n % WSIZE)); + + if 
((nm_ptr->map[w] & mask) != 0) { + nm_ptr->map[w] &= ~mask; + nm_ptr->count--; + } +} + +/** + * nmap_equal - test for equality of node maps + */ + +static inline int nmap_equal(struct node_map *nm_a, struct node_map *nm_b) +{ + return !memcmp(nm_a, nm_b, sizeof(*nm_a)); +} + +/** + * nmap_diff - find differences between node maps + * @nm_a: input node map A + * @nm_b: input node map B + * @nm_diff: output node map A-B (i.e. nodes of A that are not in B) + */ + +static inline void nmap_diff(struct node_map *nm_a, struct node_map *nm_b, + struct node_map *nm_diff) +{ + int stop = sizeof(nm_a->map) / sizeof(u32); + int w; + int b; + u32 map; + + memset(nm_diff, 0, sizeof(*nm_diff)); + for (w = 0; w < stop; w++) { + map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]); + nm_diff->map[w] = map; + if (map != 0) { + for (b = 0 ; b < WSIZE; b++) { + if (map & (1 << b)) + nm_diff->count++; + } + } + } +} + +/** + * port_list_add - add a port to a port list, ensuring no duplicates + */ + +static inline void port_list_add(struct port_list *pl_ptr, u32 port) +{ + struct port_list *item = pl_ptr; + int i; + int item_sz = PLSIZE; + int cnt = pl_ptr->count; + + for (; ; cnt -= item_sz, item = item->next) { + if (cnt < PLSIZE) + item_sz = cnt; + for (i = 0; i < item_sz; i++) + if (item->ports[i] == port) + return; + if (i < PLSIZE) { + item->ports[i] = port; + pl_ptr->count++; + return; + } + if (!item->next) { + item->next = kmalloc(sizeof(*item), GFP_ATOMIC); + if (!item->next) { + warn("Memory squeeze: multicast destination port list is incomplete\n"); + return; + } + item->next->next = NULL; + } + } +} + +/** + * port_list_free - free dynamically created entries in port_list chain + * + * Note: First item is on stack, so it doesn't need to be released + */ + +static inline void port_list_free(struct port_list *pl_ptr) +{ + struct port_list *item; + struct port_list *next; + + for (item = pl_ptr->next; item; item = next) { + next = item->next; + kfree(item); + } +} + + +int 
bclink_init(void); +void bclink_stop(void); +void bclink_acknowledge(struct node *n_ptr, u32 acked); +int bclink_send_msg(struct sk_buff *buf); +void bclink_recv_pkt(struct sk_buff *buf); +u32 bclink_get_last_sent(void); +u32 bclink_acks_missing(struct node *n_ptr); +void bclink_check_gap(struct node *n_ptr, u32 seqno); +int bclink_stats(char *stats_buf, const u32 buf_size); +int bclink_reset_stats(void); +int bclink_set_queue_limits(u32 limit); +void bcbearer_sort(void); +void bcbearer_push(void); + +#endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c new file mode 100644 index 000000000000..3dd19fdc5a2c --- /dev/null +++ b/net/tipc/bearer.c @@ -0,0 +1,692 @@ +/* + * net/tipc/bearer.c: TIPC bearer code + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "dbg.h" +#include "bearer.h" +#include "link.h" +#include "port.h" +#include "discover.h" +#include "bcast.h" + +#define MAX_ADDR_STR 32 + +static struct media *media_list = 0; +static u32 media_count = 0; + +struct bearer *bearers = 0; + +/** + * media_name_valid - validate media name + * + * Returns 1 if media name is valid, otherwise 0. + */ + +static int media_name_valid(const char *name) +{ + u32 len; + + len = strlen(name); + if ((len + 1) > TIPC_MAX_MEDIA_NAME) + return 0; + return (strspn(name, tipc_alphabet) == len); +} + +/** + * media_find - locates specified media object by name + */ + +static struct media *media_find(const char *name) +{ + struct media *m_ptr; + u32 i; + + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + if (!strcmp(m_ptr->name, name)) + return m_ptr; + } + return 0; +} + +/** + * tipc_register_media - register a media type + * + * Bearers for this media type must be activated separately at a later stage. 
+ */ + +int tipc_register_media(u32 media_type, + char *name, + int (*enable)(struct tipc_bearer *), + void (*disable)(struct tipc_bearer *), + int (*send_msg)(struct sk_buff *, + struct tipc_bearer *, + struct tipc_media_addr *), + char *(*addr2str)(struct tipc_media_addr *a, + char *str_buf, int str_size), + struct tipc_media_addr *bcast_addr, + const u32 bearer_priority, + const u32 link_tolerance, /* [ms] */ + const u32 send_window_limit) +{ + struct media *m_ptr; + u32 media_id; + u32 i; + int res = -EINVAL; + + write_lock_bh(&net_lock); + if (!media_list) + goto exit; + + if (!media_name_valid(name)) { + warn("Media registration error: illegal name <%s>\n", name); + goto exit; + } + if (!bcast_addr) { + warn("Media registration error: no broadcast address supplied\n"); + goto exit; + } + if (bearer_priority >= TIPC_NUM_LINK_PRI) { + warn("Media registration error: priority %u\n", bearer_priority); + goto exit; + } + if ((link_tolerance < TIPC_MIN_LINK_TOL) || + (link_tolerance > TIPC_MAX_LINK_TOL)) { + warn("Media registration error: tolerance %u\n", link_tolerance); + goto exit; + } + + media_id = media_count++; + if (media_id >= MAX_MEDIA) { + warn("Attempt to register more than %u media\n", MAX_MEDIA); + media_count--; + goto exit; + } + for (i = 0; i < media_id; i++) { + if (media_list[i].type_id == media_type) { + warn("Attempt to register second media with type %u\n", + media_type); + media_count--; + goto exit; + } + if (!strcmp(name, media_list[i].name)) { + warn("Attempt to re-register media name <%s>\n", name); + media_count--; + goto exit; + } + } + + m_ptr = &media_list[media_id]; + m_ptr->type_id = media_type; + m_ptr->send_msg = send_msg; + m_ptr->enable_bearer = enable; + m_ptr->disable_bearer = disable; + m_ptr->addr2str = addr2str; + memcpy(&m_ptr->bcast_addr, bcast_addr, sizeof(*bcast_addr)); + m_ptr->bcast = 1; + strcpy(m_ptr->name, name); + m_ptr->priority = bearer_priority; + m_ptr->tolerance = link_tolerance; + m_ptr->window = 
send_window_limit; + dbg("Media <%s> registered\n", name); + res = 0; +exit: + write_unlock_bh(&net_lock); + return res; +} + +/** + * media_addr_printf - record media address in print buffer + */ + +void media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a) +{ + struct media *m_ptr; + u32 media_type; + u32 i; + + media_type = ntohl(a->type); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + if (m_ptr->type_id == media_type) + break; + } + + if ((i < media_count) && (m_ptr->addr2str != NULL)) { + char addr_str[MAX_ADDR_STR]; + + tipc_printf(pb, "%s(%s) ", m_ptr->name, + m_ptr->addr2str(a, addr_str, sizeof(addr_str))); + } else { + unchar *addr = (unchar *)&a->dev_addr; + + tipc_printf(pb, "UNKNOWN(%u):", media_type); + for (i = 0; i < (sizeof(*a) - sizeof(a->type)); i++) { + tipc_printf(pb, "%02x ", addr[i]); + } + } +} + +/** + * media_get_names - record names of registered media in buffer + */ + +struct sk_buff *media_get_names(void) +{ + struct sk_buff *buf; + struct media *m_ptr; + int i; + + buf = cfg_reply_alloc(MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME)); + if (!buf) + return NULL; + + read_lock_bh(&net_lock); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + cfg_append_tlv(buf, TIPC_TLV_MEDIA_NAME, m_ptr->name, + strlen(m_ptr->name) + 1); + } + read_unlock_bh(&net_lock); + return buf; +} + +/** + * bearer_name_validate - validate & (optionally) deconstruct bearer name + * @name - ptr to bearer name string + * @name_parts - ptr to area for bearer name components (or NULL if not needed) + * + * Returns 1 if bearer name is valid, otherwise 0. 
+ */ + +static int bearer_name_validate(const char *name, + struct bearer_name *name_parts) +{ + char name_copy[TIPC_MAX_BEARER_NAME]; + char *media_name; + char *if_name; + u32 media_len; + u32 if_len; + + /* copy bearer name & ensure length is OK */ + + name_copy[TIPC_MAX_BEARER_NAME - 1] = 0; + /* need above in case non-Posix strncpy() doesn't pad with nulls */ + strncpy(name_copy, name, TIPC_MAX_BEARER_NAME); + if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0) + return 0; + + /* ensure all component parts of bearer name are present */ + + media_name = name_copy; + if ((if_name = strchr(media_name, ':')) == NULL) + return 0; + *(if_name++) = 0; + media_len = if_name - media_name; + if_len = strlen(if_name) + 1; + + /* validate component parts of bearer name */ + + if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) || + (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME) || + (strspn(media_name, tipc_alphabet) != (media_len - 1)) || + (strspn(if_name, tipc_alphabet) != (if_len - 1))) + return 0; + + /* return bearer name components, if necessary */ + + if (name_parts) { + strcpy(name_parts->media_name, media_name); + strcpy(name_parts->if_name, if_name); + } + return 1; +} + +/** + * bearer_find - locates bearer object with matching bearer name + */ + +static struct bearer *bearer_find(const char *name) +{ + struct bearer *b_ptr; + u32 i; + + for (i = 0, b_ptr = bearers; i < MAX_BEARERS; i++, b_ptr++) { + if (b_ptr->active && (!strcmp(b_ptr->publ.name, name))) + return b_ptr; + } + return 0; +} + +/** + * bearer_find - locates bearer object with matching interface name + */ + +struct bearer *bearer_find_interface(const char *if_name) +{ + struct bearer *b_ptr; + char *b_if_name; + u32 i; + + for (i = 0, b_ptr = bearers; i < MAX_BEARERS; i++, b_ptr++) { + if (!b_ptr->active) + continue; + b_if_name = strchr(b_ptr->publ.name, ':') + 1; + if (!strcmp(b_if_name, if_name)) + return b_ptr; + } + return 0; +} + +/** + * bearer_get_names - record names of bearers in buffer + 
*/ + +struct sk_buff *bearer_get_names(void) +{ + struct sk_buff *buf; + struct media *m_ptr; + struct bearer *b_ptr; + int i, j; + + buf = cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME)); + if (!buf) + return NULL; + + read_lock_bh(&net_lock); + for (i = 0, m_ptr = media_list; i < media_count; i++, m_ptr++) { + for (j = 0; j < MAX_BEARERS; j++) { + b_ptr = &bearers[j]; + if (b_ptr->active && (b_ptr->media == m_ptr)) { + cfg_append_tlv(buf, TIPC_TLV_BEARER_NAME, + b_ptr->publ.name, + strlen(b_ptr->publ.name) + 1); + } + } + } + read_unlock_bh(&net_lock); + return buf; +} + +void bearer_add_dest(struct bearer *b_ptr, u32 dest) +{ + nmap_add(&b_ptr->nodes, dest); + disc_update_link_req(b_ptr->link_req); + bcbearer_sort(); +} + +void bearer_remove_dest(struct bearer *b_ptr, u32 dest) +{ + nmap_remove(&b_ptr->nodes, dest); + disc_update_link_req(b_ptr->link_req); + bcbearer_sort(); +} + +/* + * bearer_push(): Resolve bearer congestion. Force the waiting + * links to push out their unsent packets, one packet per link + * per iteration, until all packets are gone or congestion reoccurs. 
+ * 'net_lock' is read_locked when this function is called + * bearer.lock must be taken before calling + * Returns binary true(1) ore false(0) + */ +static int bearer_push(struct bearer *b_ptr) +{ + u32 res = TIPC_OK; + struct link *ln, *tln; + + if (b_ptr->publ.blocked) + return 0; + + while (!list_empty(&b_ptr->cong_links) && (res != PUSH_FAILED)) { + list_for_each_entry_safe(ln, tln, &b_ptr->cong_links, link_list) { + res = link_push_packet(ln); + if (res == PUSH_FAILED) + break; + if (res == PUSH_FINISHED) + list_move_tail(&ln->link_list, &b_ptr->links); + } + } + return list_empty(&b_ptr->cong_links); +} + +void bearer_lock_push(struct bearer *b_ptr) +{ + int res; + + spin_lock_bh(&b_ptr->publ.lock); + res = bearer_push(b_ptr); + spin_unlock_bh(&b_ptr->publ.lock); + if (res) + bcbearer_push(); +} + + +/* + * Interrupt enabling new requests after bearer congestion or blocking: + * See bearer_send(). + */ +void tipc_continue(struct tipc_bearer *tb_ptr) +{ + struct bearer *b_ptr = (struct bearer *)tb_ptr; + + spin_lock_bh(&b_ptr->publ.lock); + b_ptr->continue_count++; + if (!list_empty(&b_ptr->cong_links)) + k_signal((Handler)bearer_lock_push, (unsigned long)b_ptr); + b_ptr->publ.blocked = 0; + spin_unlock_bh(&b_ptr->publ.lock); +} + +/* + * Schedule link for sending of messages after the bearer + * has been deblocked by 'continue()'. This method is called + * when somebody tries to send a message via this link while + * the bearer is congested. 'net_lock' is in read_lock here + * bearer.lock is busy + */ + +static void bearer_schedule_unlocked(struct bearer *b_ptr, struct link *l_ptr) +{ + list_move_tail(&l_ptr->link_list, &b_ptr->cong_links); +} + +/* + * Schedule link for sending of messages after the bearer + * has been deblocked by 'continue()'. This method is called + * when somebody tries to send a message via this link while + * the bearer is congested. 
'net_lock' is in read_lock here, + * bearer.lock is free + */ + +void bearer_schedule(struct bearer *b_ptr, struct link *l_ptr) +{ + spin_lock_bh(&b_ptr->publ.lock); + bearer_schedule_unlocked(b_ptr, l_ptr); + spin_unlock_bh(&b_ptr->publ.lock); +} + + +/* + * bearer_resolve_congestion(): Check if there is bearer congestion, + * and if there is, try to resolve it before returning. + * 'net_lock' is read_locked when this function is called + */ +int bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr) +{ + int res = 1; + + if (list_empty(&b_ptr->cong_links)) + return 1; + spin_lock_bh(&b_ptr->publ.lock); + if (!bearer_push(b_ptr)) { + bearer_schedule_unlocked(b_ptr, l_ptr); + res = 0; + } + spin_unlock_bh(&b_ptr->publ.lock); + return res; +} + + +/** + * tipc_enable_bearer - enable bearer with the given name + */ + +int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority) +{ + struct bearer *b_ptr; + struct media *m_ptr; + struct bearer_name b_name; + char addr_string[16]; + u32 bearer_id; + u32 with_this_prio; + u32 i; + int res = -EINVAL; + + if (tipc_mode != TIPC_NET_MODE) + return -ENOPROTOOPT; + if (!bearer_name_validate(name, &b_name) || + !addr_domain_valid(bcast_scope) || + !in_scope(bcast_scope, tipc_own_addr) || + (priority > TIPC_NUM_LINK_PRI)) + return -EINVAL; + + write_lock_bh(&net_lock); + if (!bearers) + goto failed; + + m_ptr = media_find(b_name.media_name); + if (!m_ptr) { + warn("No media <%s>\n", b_name.media_name); + goto failed; + } + if (priority == TIPC_NUM_LINK_PRI) + priority = m_ptr->priority; + +restart: + bearer_id = MAX_BEARERS; + with_this_prio = 1; + for (i = MAX_BEARERS; i-- != 0; ) { + if (!bearers[i].active) { + bearer_id = i; + continue; + } + if (!strcmp(name, bearers[i].publ.name)) { + warn("Bearer <%s> already enabled\n", name); + goto failed; + } + if ((bearers[i].priority == priority) && + (++with_this_prio > 2)) { + if (priority-- == 0) { + warn("Third bearer <%s> with priority %u, unable to 
lower to %u\n", + name, priority + 1, priority); + goto failed; + } + warn("Third bearer <%s> with priority %u, lowering to %u\n", + name, priority + 1, priority); + goto restart; + } + } + if (bearer_id >= MAX_BEARERS) { + warn("Attempt to enable more than %d bearers\n", MAX_BEARERS); + goto failed; + } + + b_ptr = &bearers[bearer_id]; + memset(b_ptr, 0, sizeof(struct bearer)); + + strcpy(b_ptr->publ.name, name); + res = m_ptr->enable_bearer(&b_ptr->publ); + if (res) { + warn("Failed to enable bearer <%s>\n", name); + goto failed; + } + + b_ptr->identity = bearer_id; + b_ptr->media = m_ptr; + b_ptr->net_plane = bearer_id + 'A'; + b_ptr->active = 1; + b_ptr->detect_scope = bcast_scope; + b_ptr->priority = priority; + INIT_LIST_HEAD(&b_ptr->cong_links); + INIT_LIST_HEAD(&b_ptr->links); + if (m_ptr->bcast) { + b_ptr->link_req = disc_init_link_req(b_ptr, &m_ptr->bcast_addr, + bcast_scope, 2); + } + b_ptr->publ.lock = SPIN_LOCK_UNLOCKED; + write_unlock_bh(&net_lock); + info("Enabled bearer <%s>, discovery domain %s\n", + name, addr_string_fill(addr_string, bcast_scope)); + return 0; +failed: + write_unlock_bh(&net_lock); + return res; +} + +/** + * tipc_block_bearer(): Block the bearer with the given name, + * and reset all its links + */ + +int tipc_block_bearer(const char *name) +{ + struct bearer *b_ptr = 0; + struct link *l_ptr; + struct link *temp_l_ptr; + + if (tipc_mode != TIPC_NET_MODE) + return -ENOPROTOOPT; + + read_lock_bh(&net_lock); + b_ptr = bearer_find(name); + if (!b_ptr) { + warn("Attempt to block unknown bearer <%s>\n", name); + read_unlock_bh(&net_lock); + return -EINVAL; + } + + spin_lock_bh(&b_ptr->publ.lock); + b_ptr->publ.blocked = 1; + list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { + struct node *n_ptr = l_ptr->owner; + + spin_lock_bh(&n_ptr->lock); + link_reset(l_ptr); + spin_unlock_bh(&n_ptr->lock); + } + spin_unlock_bh(&b_ptr->publ.lock); + read_unlock_bh(&net_lock); + info("Blocked bearer <%s>\n", name); + return 
TIPC_OK; +} + +/** + * bearer_disable - + * + * Note: This routine assumes caller holds net_lock. + */ + +static int bearer_disable(const char *name) +{ + struct bearer *b_ptr; + struct link *l_ptr; + struct link *temp_l_ptr; + + if (tipc_mode != TIPC_NET_MODE) + return -ENOPROTOOPT; + + b_ptr = bearer_find(name); + if (!b_ptr) { + warn("Attempt to disable unknown bearer <%s>\n", name); + return -EINVAL; + } + + disc_stop_link_req(b_ptr->link_req); + spin_lock_bh(&b_ptr->publ.lock); + b_ptr->link_req = NULL; + b_ptr->publ.blocked = 1; + if (b_ptr->media->disable_bearer) { + spin_unlock_bh(&b_ptr->publ.lock); + write_unlock_bh(&net_lock); + b_ptr->media->disable_bearer(&b_ptr->publ); + write_lock_bh(&net_lock); + spin_lock_bh(&b_ptr->publ.lock); + } + list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { + link_delete(l_ptr); + } + spin_unlock_bh(&b_ptr->publ.lock); + info("Disabled bearer <%s>\n", name); + memset(b_ptr, 0, sizeof(struct bearer)); + return TIPC_OK; +} + +int tipc_disable_bearer(const char *name) +{ + int res; + + write_lock_bh(&net_lock); + res = bearer_disable(name); + write_unlock_bh(&net_lock); + return res; +} + + + +int bearer_init(void) +{ + int res; + + write_lock_bh(&net_lock); + bearers = kmalloc(MAX_BEARERS * sizeof(struct bearer), GFP_ATOMIC); + media_list = kmalloc(MAX_MEDIA * sizeof(struct media), GFP_ATOMIC); + if (bearers && media_list) { + memset(bearers, 0, MAX_BEARERS * sizeof(struct bearer)); + memset(media_list, 0, MAX_MEDIA * sizeof(struct media)); + res = TIPC_OK; + } else { + kfree(bearers); + kfree(media_list); + bearers = 0; + media_list = 0; + res = -ENOMEM; + } + write_unlock_bh(&net_lock); + return res; +} + +void bearer_stop(void) +{ + u32 i; + + if (!bearers) + return; + + for (i = 0; i < MAX_BEARERS; i++) { + if (bearers[i].active) + bearers[i].publ.blocked = 1; + } + for (i = 0; i < MAX_BEARERS; i++) { + if (bearers[i].active) + bearer_disable(bearers[i].publ.name); + } + kfree(bearers); + 
kfree(media_list); + bearers = 0; + media_list = 0; + media_count = 0; +} + + diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h new file mode 100644 index 000000000000..21e63d3f0183 --- /dev/null +++ b/net/tipc/bearer.h @@ -0,0 +1,172 @@ +/* + * net/tipc/bearer.h: Include file for TIPC bearer code + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_BEARER_H +#define _TIPC_BEARER_H + +#include <net/tipc/tipc_bearer.h> +#include "bcast.h" + +#define MAX_BEARERS 8 +#define MAX_MEDIA 4 + + +/** + * struct media - TIPC media information available to internal users + * @send_msg: routine which handles buffer transmission + * @enable_bearer: routine which enables a bearer + * @disable_bearer: routine which disables a bearer + * @addr2str: routine which converts bearer's address to string form + * @bcast_addr: media address used in broadcasting + * @bcast: non-zero if media supports broadcasting [currently mandatory] + * @priority: default link (and bearer) priority + * @tolerance: default time (in ms) before declaring link failure + * @window: default window (in packets) before declaring link congestion + * @type_id: TIPC media identifier [defined in tipc_bearer.h] + * @name: media name + */ + +struct media { + int (*send_msg)(struct sk_buff *buf, + struct tipc_bearer *b_ptr, + struct tipc_media_addr *dest); + int (*enable_bearer)(struct tipc_bearer *b_ptr); + void (*disable_bearer)(struct tipc_bearer *b_ptr); + char *(*addr2str)(struct tipc_media_addr *a, + char *str_buf, int str_size); + struct tipc_media_addr bcast_addr; + int bcast; + u32 priority; + u32 tolerance; + u32 window; + u32 type_id; + char name[TIPC_MAX_MEDIA_NAME]; +}; + +/** + * struct bearer - TIPC bearer information available to internal users + * @publ: bearer information available to 
privileged users + * @media: ptr to media structure associated with bearer + * @priority: default link priority for bearer + * @detect_scope: network address mask used during automatic link creation + * @identity: array index of this bearer within TIPC bearer array + * @link_req: ptr to (optional) structure making periodic link setup requests + * @links: list of non-congested links associated with bearer + * @cong_links: list of congested links associated with bearer + * @continue_count: # of times bearer has resumed after congestion or blocking + * @active: non-zero if bearer structure is represents a bearer + * @net_plane: network plane ('A' through 'H') currently associated with bearer + * @nodes: indicates which nodes in cluster can be reached through bearer + */ + +struct bearer { + struct tipc_bearer publ; + struct media *media; + u32 priority; + u32 detect_scope; + u32 identity; + struct link_req *link_req; + struct list_head links; + struct list_head cong_links; + u32 continue_count; + int active; + char net_plane; + struct node_map nodes; +}; + +struct bearer_name { + char media_name[TIPC_MAX_MEDIA_NAME]; + char if_name[TIPC_MAX_IF_NAME]; +}; + +struct link; + +extern struct bearer *bearers; + +void media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a); +struct sk_buff *media_get_names(void); + +struct sk_buff *bearer_get_names(void); +void bearer_add_dest(struct bearer *b_ptr, u32 dest); +void bearer_remove_dest(struct bearer *b_ptr, u32 dest); +void bearer_schedule(struct bearer *b_ptr, struct link *l_ptr); +struct bearer *bearer_find_interface(const char *if_name); +int bearer_resolve_congestion(struct bearer *b_ptr, struct link *l_ptr); +int bearer_init(void); +void bearer_stop(void); +int bearer_broadcast(struct sk_buff *buf, struct tipc_bearer *b_ptr, + struct tipc_media_addr *dest); +void bearer_lock_push(struct bearer *b_ptr); + + +/** + * bearer_send- sends buffer to destination over bearer + * + * Returns true (1) if successful, or 
false (0) if unable to send + * + * IMPORTANT: + * The media send routine must not alter the buffer being passed in + * as it may be needed for later retransmission! + * + * If the media send routine returns a non-zero value (indicating that + * it was unable to send the buffer), it must: + * 1) mark the bearer as blocked, + * 2) call tipc_continue() once the bearer is able to send again. + * Media types that are unable to meet these two critera must ensure their + * send routine always returns success -- even if the buffer was not sent -- + * and let TIPC's link code deal with the undelivered message. + */ + +static inline int bearer_send(struct bearer *b_ptr, struct sk_buff *buf, + struct tipc_media_addr *dest) +{ + return !b_ptr->media->send_msg(buf, &b_ptr->publ, dest); +} + +/** + * bearer_congested - determines if bearer is currently congested + */ + +static inline int bearer_congested(struct bearer *b_ptr, struct link *l_ptr) +{ + if (unlikely(b_ptr->publ.blocked)) + return 1; + if (likely(list_empty(&b_ptr->cong_links))) + return 0; + return !bearer_resolve_congestion(b_ptr, l_ptr); +} + +#endif diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c new file mode 100644 index 000000000000..f0f7bac51d41 --- /dev/null +++ b/net/tipc/cluster.c @@ -0,0 +1,576 @@ +/* + * net/tipc/cluster.c: TIPC cluster management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "cluster.h" +#include "addr.h" +#include "node_subscr.h" +#include "link.h" +#include "node.h" +#include "net.h" +#include "msg.h" +#include "bearer.h" + +void cluster_multicast(struct cluster *c_ptr, struct sk_buff *buf, + u32 lower, u32 upper); +struct sk_buff *cluster_prepare_routing_msg(u32 data_size, u32 dest); + +struct node **local_nodes = 0; +struct node_map cluster_bcast_nodes = {0,{0,}}; +u32 highest_allowed_slave = 0; + +struct cluster *cluster_create(u32 addr) +{ + struct _zone *z_ptr; + struct cluster *c_ptr; + int max_nodes; + int alloc; + + c_ptr = (struct cluster *)kmalloc(sizeof(*c_ptr), GFP_ATOMIC); + if (c_ptr == NULL) + return 0; + memset(c_ptr, 0, sizeof(*c_ptr)); + + c_ptr->addr = tipc_addr(tipc_zone(addr), tipc_cluster(addr), 0); + if (in_own_cluster(addr)) + max_nodes = LOWEST_SLAVE + tipc_max_slaves; + else + max_nodes = tipc_max_nodes + 1; + alloc = sizeof(void *) * (max_nodes + 1); + c_ptr->nodes = (struct node **)kmalloc(alloc, GFP_ATOMIC); + if (c_ptr->nodes == NULL) { + kfree(c_ptr); + return 0; + } + memset(c_ptr->nodes, 0, alloc); + if (in_own_cluster(addr)) + local_nodes = c_ptr->nodes; + c_ptr->highest_slave = LOWEST_SLAVE - 1; + c_ptr->highest_node = 0; + + z_ptr = zone_find(tipc_zone(addr)); + if (z_ptr == NULL) { + z_ptr = zone_create(addr); + } + if (z_ptr != NULL) { + zone_attach_cluster(z_ptr, c_ptr); + c_ptr->owner = z_ptr; + } + else { + kfree(c_ptr); + c_ptr = 0; + } + + return c_ptr; +} + +void cluster_delete(struct cluster *c_ptr) +{ + u32 n_num; + + if (!c_ptr) + return; + for (n_num = 1; n_num <= c_ptr->highest_node; n_num++) { + node_delete(c_ptr->nodes[n_num]); + } + for (n_num = LOWEST_SLAVE; n_num <= c_ptr->highest_slave; n_num++) { + node_delete(c_ptr->nodes[n_num]); + } + kfree(c_ptr->nodes); + kfree(c_ptr); +} + +u32 cluster_next_node(struct cluster *c_ptr, u32 addr) +{ + struct node *n_ptr; + u32 n_num = tipc_node(addr) + 1; + + if (!c_ptr) + return addr; + for (; n_num <= 
c_ptr->highest_node; n_num++) { + n_ptr = c_ptr->nodes[n_num]; + if (n_ptr && node_has_active_links(n_ptr)) + return n_ptr->addr; + } + for (n_num = 1; n_num < tipc_node(addr); n_num++) { + n_ptr = c_ptr->nodes[n_num]; + if (n_ptr && node_has_active_links(n_ptr)) + return n_ptr->addr; + } + return 0; +} + +void cluster_attach_node(struct cluster *c_ptr, struct node *n_ptr) +{ + u32 n_num = tipc_node(n_ptr->addr); + u32 max_n_num = tipc_max_nodes; + + if (in_own_cluster(n_ptr->addr)) + max_n_num = highest_allowed_slave; + assert(n_num > 0); + assert(n_num <= max_n_num); + assert(c_ptr->nodes[n_num] == 0); + c_ptr->nodes[n_num] = n_ptr; + if (n_num > c_ptr->highest_node) + c_ptr->highest_node = n_num; +} + +/** + * cluster_select_router - select router to a cluster + * + * Uses deterministic and fair algorithm. + */ + +u32 cluster_select_router(struct cluster *c_ptr, u32 ref) +{ + u32 n_num; + u32 ulim = c_ptr->highest_node; + u32 mask; + u32 tstart; + + assert(!in_own_cluster(c_ptr->addr)); + if (!ulim) + return 0; + + /* Start entry must be random */ + mask = tipc_max_nodes; + while (mask > ulim) + mask >>= 1; + tstart = ref & mask; + n_num = tstart; + + /* Lookup upwards with wrap-around */ + do { + if (node_is_up(c_ptr->nodes[n_num])) + break; + } while (++n_num <= ulim); + if (n_num > ulim) { + n_num = 1; + do { + if (node_is_up(c_ptr->nodes[n_num])) + break; + } while (++n_num < tstart); + if (n_num == tstart) + return 0; + } + assert(n_num <= ulim); + return node_select_router(c_ptr->nodes[n_num], ref); +} + +/** + * cluster_select_node - select destination node within a remote cluster + * + * Uses deterministic and fair algorithm. 
+ */ + +struct node *cluster_select_node(struct cluster *c_ptr, u32 selector) +{ + u32 n_num; + u32 mask = tipc_max_nodes; + u32 start_entry; + + assert(!in_own_cluster(c_ptr->addr)); + if (!c_ptr->highest_node) + return 0; + + /* Start entry must be random */ + while (mask > c_ptr->highest_node) { + mask >>= 1; + } + start_entry = (selector & mask) ? selector & mask : 1u; + assert(start_entry <= c_ptr->highest_node); + + /* Lookup upwards with wrap-around */ + for (n_num = start_entry; n_num <= c_ptr->highest_node; n_num++) { + if (node_has_active_links(c_ptr->nodes[n_num])) + return c_ptr->nodes[n_num]; + } + for (n_num = 1; n_num < start_entry; n_num++) { + if (node_has_active_links(c_ptr->nodes[n_num])) + return c_ptr->nodes[n_num]; + } + return 0; +} + +/* + * Routing table management: See description in node.c + */ + +struct sk_buff *cluster_prepare_routing_msg(u32 data_size, u32 dest) +{ + u32 size = INT_H_SIZE + data_size; + struct sk_buff *buf = buf_acquire(size); + struct tipc_msg *msg; + + if (buf) { + msg = buf_msg(buf); + memset((char *)msg, 0, size); + msg_init(msg, ROUTE_DISTRIBUTOR, 0, TIPC_OK, INT_H_SIZE, dest); + } + return buf; +} + +void cluster_bcast_new_route(struct cluster *c_ptr, u32 dest, + u32 lower, u32 upper) +{ + struct sk_buff *buf = cluster_prepare_routing_msg(0, c_ptr->addr); + struct tipc_msg *msg; + + if (buf) { + msg = buf_msg(buf); + msg_set_remote_node(msg, dest); + msg_set_type(msg, ROUTE_ADDITION); + cluster_multicast(c_ptr, buf, lower, upper); + } else { + warn("Memory squeeze: broadcast of new route failed\n"); + } +} + +void cluster_bcast_lost_route(struct cluster *c_ptr, u32 dest, + u32 lower, u32 upper) +{ + struct sk_buff *buf = cluster_prepare_routing_msg(0, c_ptr->addr); + struct tipc_msg *msg; + + if (buf) { + msg = buf_msg(buf); + msg_set_remote_node(msg, dest); + msg_set_type(msg, ROUTE_REMOVAL); + cluster_multicast(c_ptr, buf, lower, upper); + } else { + warn("Memory squeeze: broadcast of lost route failed\n"); + } 
+} + +void cluster_send_slave_routes(struct cluster *c_ptr, u32 dest) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + u32 highest = c_ptr->highest_slave; + u32 n_num; + int send = 0; + + assert(!is_slave(dest)); + assert(in_own_cluster(dest)); + assert(in_own_cluster(c_ptr->addr)); + if (highest <= LOWEST_SLAVE) + return; + buf = cluster_prepare_routing_msg(highest - LOWEST_SLAVE + 1, + c_ptr->addr); + if (buf) { + msg = buf_msg(buf); + msg_set_remote_node(msg, c_ptr->addr); + msg_set_type(msg, SLAVE_ROUTING_TABLE); + for (n_num = LOWEST_SLAVE; n_num <= highest; n_num++) { + if (c_ptr->nodes[n_num] && + node_has_active_links(c_ptr->nodes[n_num])) { + send = 1; + msg_set_dataoctet(msg, n_num); + } + } + if (send) + link_send(buf, dest, dest); + else + buf_discard(buf); + } else { + warn("Memory squeeze: broadcast of lost route failed\n"); + } +} + +void cluster_send_ext_routes(struct cluster *c_ptr, u32 dest) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + u32 highest = c_ptr->highest_node; + u32 n_num; + int send = 0; + + if (in_own_cluster(c_ptr->addr)) + return; + assert(!is_slave(dest)); + assert(in_own_cluster(dest)); + highest = c_ptr->highest_node; + buf = cluster_prepare_routing_msg(highest + 1, c_ptr->addr); + if (buf) { + msg = buf_msg(buf); + msg_set_remote_node(msg, c_ptr->addr); + msg_set_type(msg, EXT_ROUTING_TABLE); + for (n_num = 1; n_num <= highest; n_num++) { + if (c_ptr->nodes[n_num] && + node_has_active_links(c_ptr->nodes[n_num])) { + send = 1; + msg_set_dataoctet(msg, n_num); + } + } + if (send) + link_send(buf, dest, dest); + else + buf_discard(buf); + } else { + warn("Memory squeeze: broadcast of external route failed\n"); + } +} + +void cluster_send_local_routes(struct cluster *c_ptr, u32 dest) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + u32 highest = c_ptr->highest_node; + u32 n_num; + int send = 0; + + assert(is_slave(dest)); + assert(in_own_cluster(c_ptr->addr)); + buf = cluster_prepare_routing_msg(highest, c_ptr->addr); + 
if (buf) { + msg = buf_msg(buf); + msg_set_remote_node(msg, c_ptr->addr); + msg_set_type(msg, LOCAL_ROUTING_TABLE); + for (n_num = 1; n_num <= highest; n_num++) { + if (c_ptr->nodes[n_num] && + node_has_active_links(c_ptr->nodes[n_num])) { + send = 1; + msg_set_dataoctet(msg, n_num); + } + } + if (send) + link_send(buf, dest, dest); + else + buf_discard(buf); + } else { + warn("Memory squeeze: broadcast of local route failed\n"); + } +} + +void cluster_recv_routing_table(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct cluster *c_ptr; + struct node *n_ptr; + unchar *node_table; + u32 table_size; + u32 router; + u32 rem_node = msg_remote_node(msg); + u32 z_num; + u32 c_num; + u32 n_num; + + c_ptr = cluster_find(rem_node); + if (!c_ptr) { + c_ptr = cluster_create(rem_node); + if (!c_ptr) { + buf_discard(buf); + return; + } + } + + node_table = buf->data + msg_hdr_sz(msg); + table_size = msg_size(msg) - msg_hdr_sz(msg); + router = msg_prevnode(msg); + z_num = tipc_zone(rem_node); + c_num = tipc_cluster(rem_node); + + switch (msg_type(msg)) { + case LOCAL_ROUTING_TABLE: + assert(is_slave(tipc_own_addr)); + case EXT_ROUTING_TABLE: + for (n_num = 1; n_num < table_size; n_num++) { + if (node_table[n_num]) { + u32 addr = tipc_addr(z_num, c_num, n_num); + n_ptr = c_ptr->nodes[n_num]; + if (!n_ptr) { + n_ptr = node_create(addr); + } + if (n_ptr) + node_add_router(n_ptr, router); + } + } + break; + case SLAVE_ROUTING_TABLE: + assert(!is_slave(tipc_own_addr)); + assert(in_own_cluster(c_ptr->addr)); + for (n_num = 1; n_num < table_size; n_num++) { + if (node_table[n_num]) { + u32 slave_num = n_num + LOWEST_SLAVE; + u32 addr = tipc_addr(z_num, c_num, slave_num); + n_ptr = c_ptr->nodes[slave_num]; + if (!n_ptr) { + n_ptr = node_create(addr); + } + if (n_ptr) + node_add_router(n_ptr, router); + } + } + break; + case ROUTE_ADDITION: + if (!is_slave(tipc_own_addr)) { + assert(!in_own_cluster(c_ptr->addr) + || is_slave(rem_node)); + } else { + 
assert(in_own_cluster(c_ptr->addr) + && !is_slave(rem_node)); + } + n_ptr = c_ptr->nodes[tipc_node(rem_node)]; + if (!n_ptr) + n_ptr = node_create(rem_node); + if (n_ptr) + node_add_router(n_ptr, router); + break; + case ROUTE_REMOVAL: + if (!is_slave(tipc_own_addr)) { + assert(!in_own_cluster(c_ptr->addr) + || is_slave(rem_node)); + } else { + assert(in_own_cluster(c_ptr->addr) + && !is_slave(rem_node)); + } + n_ptr = c_ptr->nodes[tipc_node(rem_node)]; + if (n_ptr) + node_remove_router(n_ptr, router); + break; + default: + assert(!"Illegal routing manager message received\n"); + } + buf_discard(buf); +} + +void cluster_remove_as_router(struct cluster *c_ptr, u32 router) +{ + u32 start_entry; + u32 tstop; + u32 n_num; + + if (is_slave(router)) + return; /* Slave nodes can not be routers */ + + if (in_own_cluster(c_ptr->addr)) { + start_entry = LOWEST_SLAVE; + tstop = c_ptr->highest_slave; + } else { + start_entry = 1; + tstop = c_ptr->highest_node; + } + + for (n_num = start_entry; n_num <= tstop; n_num++) { + if (c_ptr->nodes[n_num]) { + node_remove_router(c_ptr->nodes[n_num], router); + } + } +} + +/** + * cluster_multicast - multicast message to local nodes + */ + +void cluster_multicast(struct cluster *c_ptr, struct sk_buff *buf, + u32 lower, u32 upper) +{ + struct sk_buff *buf_copy; + struct node *n_ptr; + u32 n_num; + u32 tstop; + + assert(lower <= upper); + assert(((lower >= 1) && (lower <= tipc_max_nodes)) || + ((lower >= LOWEST_SLAVE) && (lower <= highest_allowed_slave))); + assert(((upper >= 1) && (upper <= tipc_max_nodes)) || + ((upper >= LOWEST_SLAVE) && (upper <= highest_allowed_slave))); + assert(in_own_cluster(c_ptr->addr)); + + tstop = is_slave(upper) ? 
c_ptr->highest_slave : c_ptr->highest_node; + if (tstop > upper) + tstop = upper; + for (n_num = lower; n_num <= tstop; n_num++) { + n_ptr = c_ptr->nodes[n_num]; + if (n_ptr && node_has_active_links(n_ptr)) { + buf_copy = skb_copy(buf, GFP_ATOMIC); + if (buf_copy == NULL) + break; + msg_set_destnode(buf_msg(buf_copy), n_ptr->addr); + link_send(buf_copy, n_ptr->addr, n_ptr->addr); + } + } + buf_discard(buf); +} + +/** + * cluster_broadcast - broadcast message to all nodes within cluster + */ + +void cluster_broadcast(struct sk_buff *buf) +{ + struct sk_buff *buf_copy; + struct cluster *c_ptr; + struct node *n_ptr; + u32 n_num; + u32 tstart; + u32 tstop; + u32 node_type; + + if (tipc_mode == TIPC_NET_MODE) { + c_ptr = cluster_find(tipc_own_addr); + assert(in_own_cluster(c_ptr->addr)); /* For now */ + + /* Send to standard nodes, then repeat loop sending to slaves */ + tstart = 1; + tstop = c_ptr->highest_node; + for (node_type = 1; node_type <= 2; node_type++) { + for (n_num = tstart; n_num <= tstop; n_num++) { + n_ptr = c_ptr->nodes[n_num]; + if (n_ptr && node_has_active_links(n_ptr)) { + buf_copy = skb_copy(buf, GFP_ATOMIC); + if (buf_copy == NULL) + goto exit; + msg_set_destnode(buf_msg(buf_copy), + n_ptr->addr); + link_send(buf_copy, n_ptr->addr, + n_ptr->addr); + } + } + tstart = LOWEST_SLAVE; + tstop = c_ptr->highest_slave; + } + } +exit: + buf_discard(buf); +} + +int cluster_init(void) +{ + highest_allowed_slave = LOWEST_SLAVE + tipc_max_slaves; + return cluster_create(tipc_own_addr) ? TIPC_OK : -ENOMEM; +} + diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h new file mode 100644 index 000000000000..1ffb095991df --- /dev/null +++ b/net/tipc/cluster.h @@ -0,0 +1,92 @@ +/* + * net/tipc/cluster.h: Include file for TIPC cluster management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_CLUSTER_H +#define _TIPC_CLUSTER_H + +#include "addr.h" +#include "zone.h" + +#define LOWEST_SLAVE 2048u + +/** + * struct cluster - TIPC cluster structure + * @addr: network address of cluster + * @owner: pointer to zone that cluster belongs to + * @nodes: array of pointers to all nodes within cluster + * @highest_node: id of highest numbered node within cluster + * @highest_slave: (used for secondary node support) + */ + +struct cluster { + u32 addr; + struct _zone *owner; + struct node **nodes; + u32 highest_node; + u32 highest_slave; +}; + + +extern struct node **local_nodes; +extern u32 highest_allowed_slave; +extern struct node_map cluster_bcast_nodes; + +void cluster_remove_as_router(struct cluster *c_ptr, u32 router); +void cluster_send_ext_routes(struct cluster *c_ptr, u32 dest); +struct node *cluster_select_node(struct cluster *c_ptr, u32 selector); +u32 cluster_select_router(struct cluster *c_ptr, u32 ref); +void cluster_recv_routing_table(struct sk_buff *buf); +struct cluster *cluster_create(u32 addr); +void cluster_delete(struct cluster *c_ptr); +void cluster_attach_node(struct cluster *c_ptr, struct node *n_ptr); +void cluster_send_slave_routes(struct cluster *c_ptr, u32 dest); +void cluster_broadcast(struct sk_buff *buf); +int cluster_init(void); +u32 cluster_next_node(struct cluster *c_ptr, u32 addr); +void cluster_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); +void cluster_send_local_routes(struct cluster *c_ptr, u32 dest); +void cluster_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); + +static inline struct cluster *cluster_find(u32 addr) +{ + struct _zone *z_ptr = zone_find(addr); + + if (z_ptr) + return z_ptr->clusters[1]; + return 0; +} + +#endif diff --git a/net/tipc/config.c b/net/tipc/config.c new file mode 100644 index 000000000000..8ddef4fce2c2 --- /dev/null +++ b/net/tipc/config.c @@ -0,0 +1,718 @@ +/* + * net/tipc/config.c: TIPC configuration management code + * + * Copyright 
(c) 2002-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "dbg.h" +#include "bearer.h" +#include "port.h" +#include "link.h" +#include "zone.h" +#include "addr.h" +#include "name_table.h" +#include "node.h" +#include "config.h" +#include "discover.h" + +struct subscr_data { + char usr_handle[8]; + u32 domain; + u32 port_ref; + struct list_head subd_list; +}; + +struct manager { + u32 user_ref; + u32 port_ref; + u32 subscr_ref; + u32 link_subscriptions; + struct list_head link_subscribers; +}; + +static struct manager mng = { 0}; + +static spinlock_t config_lock = SPIN_LOCK_UNLOCKED; + +static const void *req_tlv_area; /* request message TLV area */ +static int req_tlv_space; /* request message TLV area size */ +static int rep_headroom; /* reply message headroom to use */ + + +void cfg_link_event(u32 addr, char *name, int up) +{ + /* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */ +} + + +struct sk_buff *cfg_reply_alloc(int payload_size) +{ + struct sk_buff *buf; + + buf = alloc_skb(rep_headroom + payload_size, GFP_ATOMIC); + if (buf) + skb_reserve(buf, rep_headroom); + return buf; +} + +int cfg_append_tlv(struct sk_buff *buf, int tlv_type, + void *tlv_data, int tlv_data_size) +{ + struct tlv_desc *tlv = (struct tlv_desc *)buf->tail; + int new_tlv_space = TLV_SPACE(tlv_data_size); + + if (skb_tailroom(buf) < new_tlv_space) { + dbg("cfg_append_tlv unable to append TLV\n"); + return 0; + } + skb_put(buf, new_tlv_space); + tlv->tlv_type = htons(tlv_type); + tlv->tlv_len = htons(TLV_LENGTH(tlv_data_size)); + if (tlv_data_size && tlv_data) + memcpy(TLV_DATA(tlv), tlv_data, tlv_data_size); + return 1; +} + +struct sk_buff *cfg_reply_unsigned_type(u16 tlv_type, u32 value) +{ + struct sk_buff *buf; + u32 value_net; + + buf = cfg_reply_alloc(TLV_SPACE(sizeof(value))); + if (buf) { + value_net = htonl(value); + cfg_append_tlv(buf, tlv_type, &value_net, + sizeof(value_net)); + } + return buf; +} + +struct sk_buff *cfg_reply_string_type(u16 tlv_type, char *string) +{ + struct sk_buff 
*buf; + int string_len = strlen(string) + 1; + + buf = cfg_reply_alloc(TLV_SPACE(string_len)); + if (buf) + cfg_append_tlv(buf, tlv_type, string, string_len); + return buf; +} + + + + +#if 0 + +/* Now obsolete code for handling commands not yet implemented the new way */ + +int tipc_cfg_cmd(const struct tipc_cmd_msg * msg, + char *data, + u32 sz, + u32 *ret_size, + struct tipc_portid *orig) +{ + int rv = -EINVAL; + u32 cmd = msg->cmd; + + *ret_size = 0; + switch (cmd) { + case TIPC_REMOVE_LINK: + case TIPC_CMD_BLOCK_LINK: + case TIPC_CMD_UNBLOCK_LINK: + if (!cfg_check_connection(orig)) + rv = link_control(msg->argv.link_name, msg->cmd, 0); + break; + case TIPC_ESTABLISH: + { + int connected; + + tipc_isconnected(mng.conn_port_ref, &connected); + if (connected || !orig) { + rv = TIPC_FAILURE; + break; + } + rv = tipc_connect2port(mng.conn_port_ref, orig); + if (rv == TIPC_OK) + orig = 0; + break; + } + case TIPC_GET_PEER_ADDRESS: + *ret_size = link_peer_addr(msg->argv.link_name, data, sz); + break; + case TIPC_GET_ROUTES: + rv = TIPC_OK; + break; + default: {} + } + if (*ret_size) + rv = TIPC_OK; + return rv; +} + +static void cfg_cmd_event(struct tipc_cmd_msg *msg, + char *data, + u32 sz, + struct tipc_portid const *orig) +{ + int rv = -EINVAL; + struct tipc_cmd_result_msg rmsg; + struct iovec msg_sect[2]; + int *arg; + + msg->cmd = ntohl(msg->cmd); + + cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect, + data, 0); + if (ntohl(msg->magic) != TIPC_MAGIC) + goto exit; + + switch (msg->cmd) { + case TIPC_CREATE_LINK: + if (!cfg_check_connection(orig)) + rv = disc_create_link(&msg->argv.create_link); + break; + case TIPC_LINK_SUBSCRIBE: + { + struct subscr_data *sub; + + if (mng.link_subscriptions > 64) + break; + sub = (struct subscr_data *)kmalloc(sizeof(*sub), + GFP_ATOMIC); + if (sub == NULL) { + warn("Memory squeeze; dropped remote link subscription\n"); + break; + } + INIT_LIST_HEAD(&sub->subd_list); + tipc_createport(mng.user_ref, + (void *)sub, 
+ TIPC_HIGH_IMPORTANCE, + 0, + 0, + (tipc_conn_shutdown_event)cfg_linksubscr_cancel, + 0, + 0, + (tipc_conn_msg_event)cfg_linksubscr_cancel, + 0, + &sub->port_ref); + if (!sub->port_ref) { + kfree(sub); + break; + } + memcpy(sub->usr_handle,msg->usr_handle, + sizeof(sub->usr_handle)); + sub->domain = msg->argv.domain; + list_add_tail(&sub->subd_list, &mng.link_subscribers); + tipc_connect2port(sub->port_ref, orig); + rmsg.retval = TIPC_OK; + tipc_send(sub->port_ref, 2u, msg_sect); + mng.link_subscriptions++; + return; + } + default: + rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig); + } + exit: + rmsg.result_len = htonl(msg_sect[1].iov_len); + rmsg.retval = htonl(rv); + cfg_respond(msg_sect, 2u, orig); +} +#endif + +static struct sk_buff *cfg_enable_bearer(void) +{ + struct tipc_bearer_config *args; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_CONFIG)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + args = (struct tipc_bearer_config *)TLV_DATA(req_tlv_area); + if (tipc_enable_bearer(args->name, + ntohl(args->detect_scope), + ntohl(args->priority))) + return cfg_reply_error_string("unable to enable bearer"); + + return cfg_reply_none(); +} + +static struct sk_buff *cfg_disable_bearer(void) +{ + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_NAME)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + if (tipc_disable_bearer((char *)TLV_DATA(req_tlv_area))) + return cfg_reply_error_string("unable to disable bearer"); + + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_own_addr(void) +{ + u32 addr; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + addr = *(u32 *)TLV_DATA(req_tlv_area); + addr = ntohl(addr); + if (addr == tipc_own_addr) + return cfg_reply_none(); + if (!addr_node_valid(addr)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (node address)"); + if (tipc_own_addr) + return 
cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change node address once assigned)"); + + spin_unlock_bh(&config_lock); + stop_net(); + tipc_own_addr = addr; + start_net(); + spin_lock_bh(&config_lock); + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_remote_mng(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + tipc_remote_management = (value != 0); + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_publications(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 1, 65535)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max publications must be 1-65535)"); + tipc_max_publications = value; + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_subscriptions(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 1, 65535)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max subscriptions must be 1-65535"); + tipc_max_subscriptions = value; + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_ports(void) +{ + int orig_mode; + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 127, 65535)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max ports must be 127-65535)"); + + if (value == tipc_max_ports) + return cfg_reply_none(); + + if 
(atomic_read(&tipc_user_count) > 2) + return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change max ports while TIPC users exist)"); + + spin_unlock_bh(&config_lock); + orig_mode = tipc_get_mode(); + if (orig_mode == TIPC_NET_MODE) + stop_net(); + stop_core(); + tipc_max_ports = value; + start_core(); + if (orig_mode == TIPC_NET_MODE) + start_net(); + spin_lock_bh(&config_lock); + return cfg_reply_none(); +} + +static struct sk_buff *set_net_max(int value, int *parameter) +{ + int orig_mode; + + if (value != *parameter) { + orig_mode = tipc_get_mode(); + if (orig_mode == TIPC_NET_MODE) + stop_net(); + *parameter = value; + if (orig_mode == TIPC_NET_MODE) + start_net(); + } + + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_zones(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 1, 255)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max zones must be 1-255)"); + return set_net_max(value, &tipc_max_zones); +} + +static struct sk_buff *cfg_set_max_clusters(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != 1) + return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (max clusters fixed at 1)"); + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_max_nodes(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 8, 2047)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (max nodes must be 8-2047)"); + return set_net_max(value, &tipc_max_nodes); +} + +static struct 
sk_buff *cfg_set_max_slaves(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != 0) + return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (max secondary nodes fixed at 0)"); + return cfg_reply_none(); +} + +static struct sk_buff *cfg_set_netid(void) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != delimit(value, 1, 9999)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network id must be 1-9999)"); + + if (tipc_own_addr) + return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change network id once part of network)"); + + return set_net_max(value, &tipc_net_id); +} + +struct sk_buff *cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area, + int request_space, int reply_headroom) +{ + struct sk_buff *rep_tlv_buf; + + spin_lock_bh(&config_lock); + + /* Save request and reply details in a well-known location */ + + req_tlv_area = request_area; + req_tlv_space = request_space; + rep_headroom = reply_headroom; + + /* Check command authorization */ + + if (likely(orig_node == tipc_own_addr)) { + /* command is permitted */ + } else if (cmd >= 0x8000) { + rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot be done remotely)"); + goto exit; + } else if (!tipc_remote_management) { + rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NO_REMOTE); + goto exit; + } + else if (cmd >= 0x4000) { + u32 domain = 0; + + if ((nametbl_translate(TIPC_ZM_SRV, 0, &domain) == 0) || + (domain != orig_node)) { + rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_ZONE_MSTR); + goto exit; + } + } + + /* Call appropriate processing routine */ + + switch (cmd) { + case TIPC_CMD_NOOP: + rep_tlv_buf = cfg_reply_none(); 
+ break; + case TIPC_CMD_GET_NODES: + rep_tlv_buf = node_get_nodes(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_GET_LINKS: + rep_tlv_buf = node_get_links(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_SHOW_LINK_STATS: + rep_tlv_buf = link_cmd_show_stats(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_RESET_LINK_STATS: + rep_tlv_buf = link_cmd_reset_stats(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_SHOW_NAME_TABLE: + rep_tlv_buf = nametbl_get(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_GET_BEARER_NAMES: + rep_tlv_buf = bearer_get_names(); + break; + case TIPC_CMD_GET_MEDIA_NAMES: + rep_tlv_buf = media_get_names(); + break; + case TIPC_CMD_SHOW_PORTS: + rep_tlv_buf = port_get_ports(); + break; +#if 0 + case TIPC_CMD_SHOW_PORT_STATS: + rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_RESET_PORT_STATS: + rep_tlv_buf = cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED); + break; +#endif + case TIPC_CMD_SET_LOG_SIZE: + rep_tlv_buf = log_resize(req_tlv_area, req_tlv_space); + break; + case TIPC_CMD_DUMP_LOG: + rep_tlv_buf = log_dump(); + break; + case TIPC_CMD_SET_LINK_TOL: + case TIPC_CMD_SET_LINK_PRI: + case TIPC_CMD_SET_LINK_WINDOW: + rep_tlv_buf = link_cmd_config(req_tlv_area, req_tlv_space, cmd); + break; + case TIPC_CMD_ENABLE_BEARER: + rep_tlv_buf = cfg_enable_bearer(); + break; + case TIPC_CMD_DISABLE_BEARER: + rep_tlv_buf = cfg_disable_bearer(); + break; + case TIPC_CMD_SET_NODE_ADDR: + rep_tlv_buf = cfg_set_own_addr(); + break; + case TIPC_CMD_SET_REMOTE_MNG: + rep_tlv_buf = cfg_set_remote_mng(); + break; + case TIPC_CMD_SET_MAX_PORTS: + rep_tlv_buf = cfg_set_max_ports(); + break; + case TIPC_CMD_SET_MAX_PUBL: + rep_tlv_buf = cfg_set_max_publications(); + break; + case TIPC_CMD_SET_MAX_SUBSCR: + rep_tlv_buf = cfg_set_max_subscriptions(); + break; + case TIPC_CMD_SET_MAX_ZONES: + rep_tlv_buf = cfg_set_max_zones(); + break; + case TIPC_CMD_SET_MAX_CLUSTERS: + rep_tlv_buf = 
cfg_set_max_clusters(); + break; + case TIPC_CMD_SET_MAX_NODES: + rep_tlv_buf = cfg_set_max_nodes(); + break; + case TIPC_CMD_SET_MAX_SLAVES: + rep_tlv_buf = cfg_set_max_slaves(); + break; + case TIPC_CMD_SET_NETID: + rep_tlv_buf = cfg_set_netid(); + break; + case TIPC_CMD_GET_REMOTE_MNG: + rep_tlv_buf = cfg_reply_unsigned(tipc_remote_management); + break; + case TIPC_CMD_GET_MAX_PORTS: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_ports); + break; + case TIPC_CMD_GET_MAX_PUBL: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_publications); + break; + case TIPC_CMD_GET_MAX_SUBSCR: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_subscriptions); + break; + case TIPC_CMD_GET_MAX_ZONES: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_zones); + break; + case TIPC_CMD_GET_MAX_CLUSTERS: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_clusters); + break; + case TIPC_CMD_GET_MAX_NODES: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_nodes); + break; + case TIPC_CMD_GET_MAX_SLAVES: + rep_tlv_buf = cfg_reply_unsigned(tipc_max_slaves); + break; + case TIPC_CMD_GET_NETID: + rep_tlv_buf = cfg_reply_unsigned(tipc_net_id); + break; + default: + rep_tlv_buf = NULL; + break; + } + + /* Return reply buffer */ +exit: + spin_unlock_bh(&config_lock); + return rep_tlv_buf; +} + +static void cfg_named_msg_event(void *userdata, + u32 port_ref, + struct sk_buff **buf, + const unchar *msg, + u32 size, + u32 importance, + struct tipc_portid const *orig, + struct tipc_name_seq const *dest) +{ + struct tipc_cfg_msg_hdr *req_hdr; + struct tipc_cfg_msg_hdr *rep_hdr; + struct sk_buff *rep_buf; + + /* Validate configuration message header (ignore invalid message) */ + + req_hdr = (struct tipc_cfg_msg_hdr *)msg; + if ((size < sizeof(*req_hdr)) || + (size != TCM_ALIGN(ntohl(req_hdr->tcm_len))) || + (ntohs(req_hdr->tcm_flags) != TCM_F_REQUEST)) { + warn("discarded invalid configuration message\n"); + return; + } + + /* Generate reply for request (if can't, return request) */ + + rep_buf = cfg_do_cmd(orig->node, + 
ntohs(req_hdr->tcm_type), + msg + sizeof(*req_hdr), + size - sizeof(*req_hdr), + BUF_HEADROOM + MAX_H_SIZE + sizeof(*rep_hdr)); + if (rep_buf) { + skb_push(rep_buf, sizeof(*rep_hdr)); + rep_hdr = (struct tipc_cfg_msg_hdr *)rep_buf->data; + memcpy(rep_hdr, req_hdr, sizeof(*rep_hdr)); + rep_hdr->tcm_len = htonl(rep_buf->len); + rep_hdr->tcm_flags &= htons(~TCM_F_REQUEST); + } else { + rep_buf = *buf; + *buf = NULL; + } + + /* NEED TO ADD CODE TO HANDLE FAILED SEND (SUCH AS CONGESTION) */ + tipc_send_buf2port(port_ref, orig, rep_buf, rep_buf->len); +} + +int cfg_init(void) +{ + struct tipc_name_seq seq; + int res; + + memset(&mng, 0, sizeof(mng)); + INIT_LIST_HEAD(&mng.link_subscribers); + + res = tipc_attach(&mng.user_ref, 0, 0); + if (res) + goto failed; + + res = tipc_createport(mng.user_ref, 0, TIPC_CRITICAL_IMPORTANCE, + NULL, NULL, NULL, + NULL, cfg_named_msg_event, NULL, + NULL, &mng.port_ref); + if (res) + goto failed; + + seq.type = TIPC_CFG_SRV; + seq.lower = seq.upper = tipc_own_addr; + res = nametbl_publish_rsv(mng.port_ref, TIPC_ZONE_SCOPE, &seq); + if (res) + goto failed; + + return 0; + +failed: + err("Unable to create configuration service\n"); + tipc_detach(mng.user_ref); + mng.user_ref = 0; + return res; +} + +void cfg_stop(void) +{ + if (mng.user_ref) { + tipc_detach(mng.user_ref); + mng.user_ref = 0; + } +} diff --git a/net/tipc/config.h b/net/tipc/config.h new file mode 100644 index 000000000000..646377d40454 --- /dev/null +++ b/net/tipc/config.h @@ -0,0 +1,80 @@ +/* + * net/tipc/config.h: Include file for TIPC configuration service code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_CONFIG_H +#define _TIPC_CONFIG_H + +/* ---------------------------------------------------------------------- */ + +#include <linux/tipc.h> +#include <linux/tipc_config.h> +#include "link.h" + +struct sk_buff *cfg_reply_alloc(int payload_size); +int cfg_append_tlv(struct sk_buff *buf, int tlv_type, + void *tlv_data, int tlv_data_size); +struct sk_buff *cfg_reply_unsigned_type(u16 tlv_type, u32 value); +struct sk_buff *cfg_reply_string_type(u16 tlv_type, char *string); + +static inline struct sk_buff *cfg_reply_none(void) +{ + return cfg_reply_alloc(0); +} + +static inline struct sk_buff *cfg_reply_unsigned(u32 value) +{ + return cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value); +} + +static inline struct sk_buff *cfg_reply_error_string(char *string) +{ + return cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string); +} + +static inline struct sk_buff *cfg_reply_ultra_string(char *string) +{ + return cfg_reply_string_type(TIPC_TLV_ULTRA_STRING, string); +} + +struct sk_buff *cfg_do_cmd(u32 orig_node, u16 cmd, + const void *req_tlv_area, int req_tlv_space, + int headroom); + +void cfg_link_event(u32 addr, char *name, int up); +int cfg_init(void); +void cfg_stop(void); + +#endif diff --git a/net/tipc/core.c b/net/tipc/core.c new file mode 100644 index 000000000000..e83ac06e31ba --- /dev/null +++ b/net/tipc/core.c @@ -0,0 +1,285 @@ +/* + * net/tipc/core.c: TIPC module code + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/version.h> +#include <linux/random.h> + +#include "core.h" +#include "dbg.h" +#include "ref.h" +#include "net.h" +#include "user_reg.h" +#include "name_table.h" +#include "subscr.h" +#include "config.h" + +int eth_media_start(void); +void eth_media_stop(void); +int handler_start(void); +void handler_stop(void); +int socket_init(void); +void socket_stop(void); +int netlink_start(void); +void netlink_stop(void); + +#define MOD_NAME "tipc_start: " + +#ifndef CONFIG_TIPC_ZONES +#define CONFIG_TIPC_ZONES 3 +#endif + +#ifndef CONFIG_TIPC_CLUSTERS +#define CONFIG_TIPC_CLUSTERS 1 +#endif + +#ifndef CONFIG_TIPC_NODES +#define CONFIG_TIPC_NODES 255 +#endif + +#ifndef CONFIG_TIPC_SLAVE_NODES +#define CONFIG_TIPC_SLAVE_NODES 0 +#endif + +#ifndef CONFIG_TIPC_PORTS +#define CONFIG_TIPC_PORTS 8191 +#endif + +#ifndef CONFIG_TIPC_LOG +#define CONFIG_TIPC_LOG 0 +#endif + +/* global variables used by multiple sub-systems within TIPC */ + +int tipc_mode = TIPC_NOT_RUNNING; +int tipc_random; +atomic_t tipc_user_count = ATOMIC_INIT(0); + +const char tipc_alphabet[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"; + +/* configurable TIPC parameters */ + +u32 tipc_own_addr; +int tipc_max_zones; +int tipc_max_clusters; +int tipc_max_nodes; +int tipc_max_slaves; +int tipc_max_ports; +int tipc_max_subscriptions; +int tipc_max_publications; +int tipc_net_id; +int tipc_remote_management; + + +int tipc_get_mode(void) +{ + return tipc_mode; +} + +/** + * stop_net - shut down TIPC networking sub-systems + */ + +void stop_net(void) +{ + eth_media_stop(); + tipc_stop_net(); +} + +/** + * start_net - start TIPC networking sub-systems + */ + +int start_net(void) +{ + int res; + + if ((res = tipc_start_net()) || + (res = eth_media_start())) { + stop_net(); + } + return res; +} + +/** + * stop_core - switch TIPC from SINGLE NODE to NOT RUNNING mode + */ + +void stop_core(void) +{ 
+ if (tipc_mode != TIPC_NODE_MODE) + return; + + tipc_mode = TIPC_NOT_RUNNING; + + netlink_stop(); + handler_stop(); + cfg_stop(); + subscr_stop(); + reg_stop(); + nametbl_stop(); + ref_table_stop(); + socket_stop(); +} + +/** + * start_core - switch TIPC from NOT RUNNING to SINGLE NODE mode + */ + +int start_core(void) +{ + int res; + + if (tipc_mode != TIPC_NOT_RUNNING) + return -ENOPROTOOPT; + + get_random_bytes(&tipc_random, sizeof(tipc_random)); + tipc_mode = TIPC_NODE_MODE; + + if ((res = handler_start()) || + (res = ref_table_init(tipc_max_ports + tipc_max_subscriptions, + tipc_random)) || + (res = reg_start()) || + (res = nametbl_init()) || + (res = k_signal((Handler)subscr_start, 0)) || + (res = k_signal((Handler)cfg_init, 0)) || + (res = netlink_start()) || + (res = socket_init())) { + stop_core(); + } + return res; +} + + +/** + * tipc_init - module entry point; apply default configuration and start core + * + * Returns 0 if TIPC started in single node mode, otherwise the error code + * reported by start_core(). + */ + +static int __init tipc_init(void) +{ + int res; + + log_reinit(CONFIG_TIPC_LOG); + info("Activated (compiled " __DATE__ " " __TIME__ ")\n"); + + tipc_own_addr = 0; + tipc_remote_management = 1; + tipc_max_publications = 10000; + tipc_max_subscriptions = 2000; + tipc_max_ports = delimit(CONFIG_TIPC_PORTS, 127, 65536); + tipc_max_zones = delimit(CONFIG_TIPC_ZONES, 1, 511); + tipc_max_clusters = delimit(CONFIG_TIPC_CLUSTERS, 1, 1); + tipc_max_nodes = delimit(CONFIG_TIPC_NODES, 8, 2047); + tipc_max_slaves = delimit(CONFIG_TIPC_SLAVE_NODES, 0, 2047); + tipc_net_id = 4711; + + if ((res = start_core())) { + err("Unable to start in single node mode\n"); + /* init is failing, so tipc_exit() will never run: release the log buffer set up by log_reinit() */ + log_stop(); + } else { + info("Started in single node mode\n"); + } + return res; +} + +static void __exit tipc_exit(void) +{ + stop_net(); + stop_core(); + info("Deactivated\n"); + log_stop(); +} + +module_init(tipc_init); +module_exit(tipc_exit); + +MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* Native TIPC API for kernel-space applications (see tipc.h) */ + +EXPORT_SYMBOL(tipc_attach); +EXPORT_SYMBOL(tipc_detach); +EXPORT_SYMBOL(tipc_get_addr); 
+EXPORT_SYMBOL(tipc_get_mode); +EXPORT_SYMBOL(tipc_createport); +EXPORT_SYMBOL(tipc_deleteport); +EXPORT_SYMBOL(tipc_ownidentity); +EXPORT_SYMBOL(tipc_portimportance); +EXPORT_SYMBOL(tipc_set_portimportance); +EXPORT_SYMBOL(tipc_portunreliable); +EXPORT_SYMBOL(tipc_set_portunreliable); +EXPORT_SYMBOL(tipc_portunreturnable); +EXPORT_SYMBOL(tipc_set_portunreturnable); +EXPORT_SYMBOL(tipc_publish); +EXPORT_SYMBOL(tipc_withdraw); +EXPORT_SYMBOL(tipc_connect2port); +EXPORT_SYMBOL(tipc_disconnect); +EXPORT_SYMBOL(tipc_shutdown); +EXPORT_SYMBOL(tipc_isconnected); +EXPORT_SYMBOL(tipc_peer); +EXPORT_SYMBOL(tipc_ref_valid); +EXPORT_SYMBOL(tipc_send); +EXPORT_SYMBOL(tipc_send_buf); +EXPORT_SYMBOL(tipc_send2name); +EXPORT_SYMBOL(tipc_forward2name); +EXPORT_SYMBOL(tipc_send_buf2name); +EXPORT_SYMBOL(tipc_forward_buf2name); +EXPORT_SYMBOL(tipc_send2port); +EXPORT_SYMBOL(tipc_forward2port); +EXPORT_SYMBOL(tipc_send_buf2port); +EXPORT_SYMBOL(tipc_forward_buf2port); +EXPORT_SYMBOL(tipc_multicast); +/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */ +EXPORT_SYMBOL(tipc_ispublished); +EXPORT_SYMBOL(tipc_available_nodes); + +/* TIPC API for external bearers (see tipc_bearer.h) */ + +EXPORT_SYMBOL(tipc_block_bearer); +EXPORT_SYMBOL(tipc_continue); +EXPORT_SYMBOL(tipc_disable_bearer); +EXPORT_SYMBOL(tipc_enable_bearer); +EXPORT_SYMBOL(tipc_recv_msg); +EXPORT_SYMBOL(tipc_register_media); + +/* TIPC API for external APIs (see tipc_port.h) */ + +EXPORT_SYMBOL(tipc_createport_raw); +EXPORT_SYMBOL(tipc_set_msg_option); +EXPORT_SYMBOL(tipc_reject_msg); +EXPORT_SYMBOL(tipc_send_buf_fast); +EXPORT_SYMBOL(tipc_acknowledge); +EXPORT_SYMBOL(tipc_get_port); +EXPORT_SYMBOL(tipc_get_handle); + diff --git a/net/tipc/core.h b/net/tipc/core.h new file mode 100644 index 000000000000..b69b60b2cc86 --- /dev/null +++ b/net/tipc/core.h @@ -0,0 +1,316 @@ +/* + * net/tipc/core.h: Include file for TIPC global declarations + * + * Copyright (c) 2005-2006, Ericsson AB + * Copyright (c) 2005, Wind River 
Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_CORE_H +#define _TIPC_CORE_H + +#include <net/tipc/tipc.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <asm/uaccess.h> +#include <linux/interrupt.h> +#include <asm/atomic.h> +#include <asm/hardirq.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/list.h> +#include <linux/vmalloc.h> + +/* + * TIPC debugging code + */ + +#define assert(i) BUG_ON(!(i)) + +struct tipc_msg; +extern struct print_buf *CONS, *LOG; +extern struct print_buf *TEE(struct print_buf *, struct print_buf *); +void msg_print(struct print_buf*,struct tipc_msg *,const char*); +void tipc_printf(struct print_buf *, const char *fmt, ...); +void tipc_dump(struct print_buf*,const char *fmt, ...); + +#ifdef CONFIG_TIPC_DEBUG + +/* + * TIPC debug support included: + * - system messages are printed to TIPC_OUTPUT print buffer + * - debug messages are printed to DBG_OUTPUT print buffer + */ + +#define err(fmt, arg...) tipc_printf(TIPC_OUTPUT, KERN_ERR "TIPC: " fmt, ## arg) +#define warn(fmt, arg...) tipc_printf(TIPC_OUTPUT, KERN_WARNING "TIPC: " fmt, ## arg) +#define info(fmt, arg...) tipc_printf(TIPC_OUTPUT, KERN_NOTICE "TIPC: " fmt, ## arg) + +#define dbg(fmt, arg...) do {if (DBG_OUTPUT) tipc_printf(DBG_OUTPUT, fmt, ## arg);} while(0) +#define msg_dbg(msg, txt) do {if (DBG_OUTPUT) msg_print(DBG_OUTPUT, msg, txt);} while(0) +#define dump(fmt, arg...) do {if (DBG_OUTPUT) tipc_dump(DBG_OUTPUT, fmt, ##arg);} while(0) + + +/* + * By default, TIPC_OUTPUT is defined to be system console and TIPC log buffer, + * while DBG_OUTPUT is the null print buffer. These defaults can be changed + * here, or on a per .c file basis, by redefining these symbols. The following + * print buffer options are available: + * + * NULL : Output to null print buffer (i.e. 
print nowhere) + * CONS : Output to system console + * LOG : Output to TIPC log buffer + * &buf : Output to user-defined buffer (struct print_buf *) + * TEE(&buf_a,&buf_b) : Output to two print buffers (eg. TEE(CONS,LOG) ) + */ + +#ifndef TIPC_OUTPUT +#define TIPC_OUTPUT TEE(CONS,LOG) +#endif + +#ifndef DBG_OUTPUT +#define DBG_OUTPUT NULL +#endif + +#else + +#ifndef DBG_OUTPUT +#define DBG_OUTPUT NULL +#endif + +/* + * TIPC debug support not included: + * - system messages are printed to system console + * - debug messages are not printed + */ + +#define err(fmt, arg...) printk(KERN_ERR "TIPC: " fmt , ## arg) +#define info(fmt, arg...) printk(KERN_INFO "TIPC: " fmt , ## arg) +#define warn(fmt, arg...) printk(KERN_WARNING "TIPC: " fmt , ## arg) + +#define dbg(fmt, arg...) do {} while (0) +#define msg_dbg(msg,txt) do {} while (0) +#define dump(fmt,arg...) do {} while (0) + +#endif + + +/* + * TIPC-specific error codes + */ + +#define ELINKCONG EAGAIN /* link congestion <=> resource unavailable */ + +/* + * Global configuration variables + */ + +extern u32 tipc_own_addr; +extern int tipc_max_zones; +extern int tipc_max_clusters; +extern int tipc_max_nodes; +extern int tipc_max_slaves; +extern int tipc_max_ports; +extern int tipc_max_subscriptions; +extern int tipc_max_publications; +extern int tipc_net_id; +extern int tipc_remote_management; + +/* + * Other global variables + */ + +extern int tipc_mode; +extern int tipc_random; +extern const char tipc_alphabet[]; +extern atomic_t tipc_user_count; + + +/* + * Routines available to privileged subsystems + */ + +extern int start_core(void); +extern void stop_core(void); +extern int start_net(void); +extern void stop_net(void); + +static inline int delimit(int val, int min, int max) +{ + if (val > max) + return max; + if (val < min) + return min; + return val; +} + + +/* + * TIPC timer and signal code + */ + +typedef void (*Handler) (unsigned long); + +u32 k_signal(Handler routine, unsigned long argument); + +/** + * 
k_init_timer - initialize a timer + * @timer: pointer to timer structure + * @routine: pointer to routine to invoke when timer expires + * @argument: value to pass to routine when timer expires + * + * Timer must be initialized before use (and terminated when no longer needed). + */ + +static inline void k_init_timer(struct timer_list *timer, Handler routine, + unsigned long argument) +{ + dbg("initializing timer %p\n", timer); + init_timer(timer); + timer->function = routine; + timer->data = argument; +} + +/** + * k_start_timer - start a timer + * @timer: pointer to timer structure + * @msec: time to delay (in ms) + * + * Schedules a previously initialized timer for later execution. + * If timer is already running, the new timeout overrides the previous request. + * + * To ensure the timer doesn't expire before the specified delay elapses, + * the amount of delay is rounded up when converting to the jiffies + * then an additional jiffy is added to account for the fact that + * the starting time may be in the middle of the current jiffy. + */ + +static inline void k_start_timer(struct timer_list *timer, unsigned long msec) +{ + /* msec is unsigned long, so it must be printed with %lu (not %u) */ + dbg("starting timer %p for %lu\n", timer, msec); + mod_timer(timer, jiffies + msecs_to_jiffies(msec) + 1); +} + +/** + * k_cancel_timer - cancel a timer + * @timer: pointer to timer structure + * + * Cancels a previously initialized timer. + * Can be called safely even if the timer is already inactive. + * + * WARNING: Must not be called when holding locks required by the timer's + * timeout routine, otherwise deadlock can occur on SMP systems! + */ + +static inline void k_cancel_timer(struct timer_list *timer) +{ + dbg("cancelling timer %p\n", timer); + del_timer_sync(timer); +} + +/** + * k_term_timer - terminate a timer + * @timer: pointer to timer structure + * + * Prevents further use of a previously initialized timer. + * + * WARNING: Caller must ensure timer isn't currently running. 
+ * + * (Do not "enhance" this routine to automatically cancel an active timer, + * otherwise deadlock can arise when a timeout routine calls k_term_timer.) + */ + +static inline void k_term_timer(struct timer_list *timer) +{ + dbg("terminating timer %p\n", timer); +} + + +/* + * TIPC message buffer code + * + * TIPC message buffer headroom leaves room for 14 byte Ethernet header, + * while ensuring TIPC header is word aligned for quicker access + */ + +#define BUF_HEADROOM 16u + +struct tipc_skb_cb { + void *handle; +}; + +#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) + + +static inline struct tipc_msg *buf_msg(struct sk_buff *skb) +{ + return (struct tipc_msg *)skb->data; +} + +/** + * buf_acquire - creates a TIPC message buffer + * @size: message size (including TIPC header) + * + * Returns a new buffer, or NULL if the allocation fails. + * Space is reserved for a data link header. + */ + +static inline struct sk_buff *buf_acquire(u32 size) +{ + struct sk_buff *skb; + unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; + + skb = alloc_skb(buf_size, GFP_ATOMIC); + if (skb) { + skb_reserve(skb, BUF_HEADROOM); + skb_put(skb, size); + skb->next = NULL; + } + return skb; +} + +/** + * buf_discard - frees a TIPC message buffer + * @skb: message buffer + * + * Frees a new buffer. If passed NULL, just returns. + */ + +static inline void buf_discard(struct sk_buff *skb) +{ + if (likely(skb != NULL)) + kfree_skb(skb); +} + +#endif diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c new file mode 100644 index 000000000000..7ed60a1cfbb8 --- /dev/null +++ b/net/tipc/dbg.c @@ -0,0 +1,395 @@ +/* + * net/tipc/dbg.c: TIPC print buffer routines for debugging + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "config.h" +#include "dbg.h" + +#define MAX_STRING 512 + +static char print_string[MAX_STRING]; +static spinlock_t print_lock = SPIN_LOCK_UNLOCKED; + +static struct print_buf cons_buf = { NULL, 0, NULL, NULL }; +struct print_buf *CONS = &cons_buf; + +static struct print_buf log_buf = { NULL, 0, NULL, NULL }; +struct print_buf *LOG = &log_buf; + + +#define FORMAT(PTR,LEN,FMT) \ +{\ + va_list args;\ + va_start(args, FMT);\ + LEN = vsprintf(PTR, FMT, args);\ + va_end(args);\ + *(PTR + LEN) = '\0';\ +} + +/* + * Locking policy when using print buffers. + * + * 1) Routines of the form printbuf_XXX() rely on the caller to prevent + * simultaneous use of the print buffer(s) being manipulated. + * 2) tipc_printf() uses 'print_lock' to prevent simultaneous use of + * 'print_string' and to protect its print buffer(s). + * 3) TEE() uses 'print_lock' to protect its print buffer(s). + * 4) Routines of the form log_XXX() uses 'print_lock' to protect LOG. + */ + +/** + * printbuf_init - initialize print buffer to empty + */ + +void printbuf_init(struct print_buf *pb, char *raw, u32 sz) +{ + if (!pb || !raw || (sz < (MAX_STRING + 1))) + return; + + pb->crs = pb->buf = raw; + pb->size = sz; + pb->next = 0; + pb->buf[0] = 0; + pb->buf[sz-1] = ~0; +} + +/** + * printbuf_reset - reinitialize print buffer to empty state + */ + +void printbuf_reset(struct print_buf *pb) +{ + if (pb && pb->buf) + printbuf_init(pb, pb->buf, pb->size); +} + +/** + * printbuf_empty - test if print buffer is in empty state + */ + +int printbuf_empty(struct print_buf *pb) +{ + return (!pb || !pb->buf || (pb->crs == pb->buf)); +} + +/** + * printbuf_validate - check for print buffer overflow + * + * Verifies that a print buffer has captured all data written to it. 
+ * If data has been lost, linearize buffer and prepend an error message + * + * Returns length of print buffer data string (including trailing NULL) + */ + +int printbuf_validate(struct print_buf *pb) +{ + char *err = " *** PRINT BUFFER WRAPPED AROUND ***\n"; + char *cp_buf; + struct print_buf cb; + + if (!pb || !pb->buf) + return 0; + + if (pb->buf[pb->size - 1] == '\0') { + cp_buf = kmalloc(pb->size, GFP_ATOMIC); + if (cp_buf != NULL){ + printbuf_init(&cb, cp_buf, pb->size); + printbuf_move(&cb, pb); + printbuf_move(pb, &cb); + kfree(cp_buf); + memcpy(pb->buf, err, strlen(err)); + } else { + printbuf_reset(pb); + tipc_printf(pb, err); + } + } + return (pb->crs - pb->buf + 1); +} + +/** + * printbuf_move - move print buffer contents to another print buffer + * + * Current contents of destination print buffer (if any) are discarded. + * Source print buffer becomes empty if a successful move occurs. + */ + +void printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from) +{ + int len; + + /* Handle the cases where contents can't be moved */ + + if (!pb_to || !pb_to->buf) + return; + + if (!pb_from || !pb_from->buf) { + printbuf_reset(pb_to); + return; + } + + if (pb_to->size < pb_from->size) { + printbuf_reset(pb_to); + tipc_printf(pb_to, "*** PRINT BUFFER OVERFLOW ***"); + return; + } + + /* Copy data from char after cursor to end (if used) */ + len = pb_from->buf + pb_from->size - pb_from->crs - 2; + if ((pb_from->buf[pb_from->size-1] == 0) && (len > 0)) { + strcpy(pb_to->buf, pb_from->crs + 1); + pb_to->crs = pb_to->buf + len; + } else + pb_to->crs = pb_to->buf; + + /* Copy data from start to cursor (always) */ + len = pb_from->crs - pb_from->buf; + strcpy(pb_to->crs, pb_from->buf); + pb_to->crs += len; + + printbuf_reset(pb_from); +} + +/** + * tipc_printf - append formatted output to print buffer chain + */ + +void tipc_printf(struct print_buf *pb, const char *fmt, ...) 
+{ + va_list args; + int chars_to_add; + int chars_left; + char save_char; + struct print_buf *pb_next; + + spin_lock_bh(&print_lock); + + /* + * Format with an explicit bound so an over-long message can never + * write past the end of 'print_string'; vsnprintf() reports the + * length the full output would have needed. + */ + va_start(args, fmt); + chars_to_add = vsnprintf(print_string, MAX_STRING, fmt, args); + va_end(args); + if (chars_to_add >= MAX_STRING) { + /* output was truncated: substitute a marker and use ITS length */ + strcpy(print_string, "*** STRING TOO LONG ***"); + chars_to_add = strlen(print_string); + } + + while (pb) { + if (pb == CONS) + printk("%s", print_string); /* already formatted: must not be re-parsed as a format */ + else if (pb->buf) { + chars_left = pb->buf + pb->size - pb->crs - 1; + if (chars_to_add <= chars_left) { + strcpy(pb->crs, print_string); + pb->crs += chars_to_add; + } else { + /* wrap around: tail of string goes to start of buffer, head fills the remaining space */ + strcpy(pb->buf, print_string + chars_left); + save_char = print_string[chars_left]; + print_string[chars_left] = 0; + strcpy(pb->crs, print_string); + print_string[chars_left] = save_char; + pb->crs = pb->buf + chars_to_add - chars_left; + } + } + /* a TEE() chain is one-shot: unlink each buffer once it has been written */ + pb_next = pb->next; + pb->next = 0; + pb = pb_next; + } + spin_unlock_bh(&print_lock); +} + +/** + * TEE - perform next output operation on both print buffers + */ + +struct print_buf *TEE(struct print_buf *b0, struct print_buf *b1) +{ + struct print_buf *pb = b0; + + if (!b0 || (b0 == b1)) + return b1; + if (!b1) + return b0; + + spin_lock_bh(&print_lock); + while (pb->next) { + if ((pb->next == b1) || (pb->next == b0)) + pb->next = pb->next->next; + else + pb = pb->next; + } + pb->next = b1; + spin_unlock_bh(&print_lock); + return b0; +} + +/** + * print_to_console - write string of bytes to console in multiple chunks + */ + +static void print_to_console(char *crs, int len) +{ + int rest = len; + + while (rest > 0) { + int sz = rest < MAX_STRING ? 
rest : MAX_STRING; + char c = crs[sz]; + + /* temporarily terminate the chunk so it can be printed as a string */ + crs[sz] = 0; + printk("%s", crs); /* buffer contents are data, not a format string */ + crs[sz] = c; + rest -= sz; + crs += sz; + } +} + +/** + * printbuf_dump - write print buffer contents to console + */ + +static void printbuf_dump(struct print_buf *pb) +{ + int len; + + /* Nothing to dump if no storage has been attached to the print buffer */ + if (!pb->buf) + return; + + /* Dump print buffer from char after cursor to end (if used) */ + len = pb->buf + pb->size - pb->crs - 2; + if ((pb->buf[pb->size - 1] == 0) && (len > 0)) + print_to_console(pb->crs + 1, len); + + /* Dump print buffer from start to cursor (always) */ + len = pb->crs - pb->buf; + print_to_console(pb->buf, len); +} + +/** + * tipc_dump - dump non-console print buffer(s) to console + * @pb: first print buffer of the chain to dump + * @fmt: format of the message printed to the console before the dump + */ + +void tipc_dump(struct print_buf *pb, const char *fmt, ...) +{ + struct print_buf *pb_next; + va_list args; + + spin_lock_bh(&print_lock); + + /* + * Format the message into 'print_string' (protected by 'print_lock'); + * the console print buffer has no storage of its own to format into. + */ + va_start(args, fmt); + vsnprintf(print_string, MAX_STRING, fmt, args); + va_end(args); + printk("%s", print_string); + + for (; pb; pb = pb_next) { + /* fetch and unlink the successor first: printbuf_reset() clears pb->next */ + pb_next = pb->next; + pb->next = NULL; + if (pb == CONS) + continue; + printk("\n---- Start of dump,%s log ----\n\n", + (pb == LOG) ? "global" : "local"); + printbuf_dump(pb); + printbuf_reset(pb); + printk("\n-------- End of dump --------\n"); + } + spin_unlock_bh(&print_lock); +} + +/** + * log_stop - free up TIPC log print buffer + */ + +void log_stop(void) +{ + spin_lock_bh(&print_lock); + if (LOG->buf) { + kfree(LOG->buf); + LOG->buf = NULL; + } + spin_unlock_bh(&print_lock); +} + +/** + * log_reinit - set TIPC log print buffer to specified size + */ + +void log_reinit(int log_size) +{ + log_stop(); + + if (log_size) { + if (log_size <= MAX_STRING) + log_size = MAX_STRING + 1; + spin_lock_bh(&print_lock); + printbuf_init(LOG, kmalloc(log_size, GFP_ATOMIC), log_size); + spin_unlock_bh(&print_lock); + } +} + +/** + * log_resize - reconfigure size of TIPC log buffer + */ + +struct sk_buff *log_resize(const void *req_tlv_area, int req_tlv_space) +{ + u32 value; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + value = *(u32 *)TLV_DATA(req_tlv_area); + value = ntohl(value); + if (value != 
delimit(value, 0, 32768)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (log size must be 0-32768)"); + log_reinit(value); + return cfg_reply_none(); +} + +/** + * log_dump - capture TIPC log buffer contents in configuration message + */ + +struct sk_buff *log_dump(void) +{ + struct sk_buff *reply; + + spin_lock_bh(&print_lock); + if (!LOG->buf) + reply = cfg_reply_ultra_string("log not activated\n"); + else if (printbuf_empty(LOG)) + reply = cfg_reply_ultra_string("log is empty\n"); + else { + struct tlv_desc *rep_tlv; + struct print_buf pb; + int str_len; + + str_len = min(LOG->size, 32768u); + reply = cfg_reply_alloc(TLV_SPACE(str_len)); + if (reply) { + rep_tlv = (struct tlv_desc *)reply->data; + printbuf_init(&pb, TLV_DATA(rep_tlv), str_len); + printbuf_move(&pb, LOG); + str_len = strlen(TLV_DATA(rep_tlv)) + 1; + skb_put(reply, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + } + } + spin_unlock_bh(&print_lock); + return reply; +} + diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h new file mode 100644 index 000000000000..c6b2a64c224f --- /dev/null +++ b/net/tipc/dbg.h @@ -0,0 +1,59 @@ +/* + * net/tipc/dbg.h: Include file for TIPC print buffer routines + * + * Copyright (c) 1997-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_DBG_H +#define _TIPC_DBG_H + +struct print_buf { + char *buf; + u32 size; + char *crs; + struct print_buf *next; +}; + +void printbuf_init(struct print_buf *pb, char *buf, u32 sz); +void printbuf_reset(struct print_buf *pb); +int printbuf_empty(struct print_buf *pb); +int printbuf_validate(struct print_buf *pb); +void printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from); + +void log_reinit(int log_size); +void log_stop(void); + +struct sk_buff *log_resize(const void *req_tlv_area, int req_tlv_space); +struct sk_buff *log_dump(void); + +#endif diff --git a/net/tipc/discover.c b/net/tipc/discover.c new file mode 100644 index 000000000000..b106ef1621cc --- /dev/null +++ b/net/tipc/discover.c @@ -0,0 +1,318 @@ +/* + * net/tipc/discover.c + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "dbg.h" +#include "link.h" +#include "zone.h" +#include "discover.h" +#include "port.h" +#include "name_table.h" + +#define TIPC_LINK_REQ_INIT 125 /* min delay during bearer start up */ +#define TIPC_LINK_REQ_FAST 2000 /* normal delay if bearer has no links */ +#define TIPC_LINK_REQ_SLOW 600000 /* normal delay if bearer has links */ + +#if 0 +#define GET_NODE_INFO 300 +#define GET_NODE_INFO_RESULT 301 +#define FORWARD_LINK_PROBE 302 +#define LINK_REQUEST_REJECTED 303 +#define LINK_REQUEST_ACCEPTED 304 +#define DROP_LINK_REQUEST 305 +#define CHECK_LINK_COUNT 306 +#endif + +/* + * TODO: Most of the inter-cluster setup stuff should be + * rewritten, and be made conformant with specification. 
+ */ + + +/** + * struct link_req - information about an ongoing link setup request + * @bearer: bearer issuing requests + * @dest: destination address for request messages + * @buf: request message to be (repeatedly) sent + * @timer: timer governing period between requests + * @timer_intv: current interval between requests (in ms) + */ +struct link_req { + struct bearer *bearer; + struct tipc_media_addr dest; + struct sk_buff *buf; + struct timer_list timer; + unsigned int timer_intv; +}; + + +#if 0 +int disc_create_link(const struct tipc_link_create *argv) +{ + /* + * Code for inter cluster link setup here + */ + return TIPC_OK; +} +#endif + +/** + * disc_link_event - notification that a link has changed state + * @addr: network address associated with the link (presumably the peer node - TODO confirm) + * @name: link name (not used yet) + * @up: presumably non-zero if the link came up, zero if contact was lost (not used yet) + * + * Does nothing for addresses within own cluster; the inter-cluster handling + * is still a placeholder. + */ + +void disc_link_event(u32 addr, char *name, int up) +{ + if (in_own_cluster(addr)) + return; + /* + * Code for inter cluster link setup here + */ +} + +/** + * disc_init_msg - initialize a link setup message + * @type: message type (request or response) + * @req_links: number of links associated with message + * @dest_domain: network domain of node(s) which should respond to message + * @b_ptr: ptr to bearer issuing message + * + * Returns pointer to the initialized message buffer, or NULL if no buffer + * could be acquired. + */ + +struct sk_buff *disc_init_msg(u32 type, + u32 req_links, + u32 dest_domain, + struct bearer *b_ptr) +{ + struct sk_buff *buf = buf_acquire(DSC_H_SIZE); + struct tipc_msg *msg; + + if (buf) { + msg = buf_msg(buf); + msg_init(msg, LINK_CONFIG, type, TIPC_OK, DSC_H_SIZE, + dest_domain); + msg_set_non_seq(msg); + msg_set_req_links(msg, req_links); + msg_set_dest_domain(msg, dest_domain); + msg_set_bc_netid(msg, tipc_net_id); + msg_set_media_addr(msg, &b_ptr->publ.addr); + } + return buf; +} + +/** + * disc_recv_msg - handle incoming link setup message (request or response) + * @buf: buffer containing message + */ + +void disc_recv_msg(struct sk_buff *buf) +{ + struct bearer *b_ptr = (struct bearer *)TIPC_SKB_CB(buf)->handle; + struct link *link; + struct tipc_media_addr media_addr; + struct tipc_msg *msg = buf_msg(buf); + u32 dest = 
msg_dest_domain(msg); + u32 orig = msg_prevnode(msg); + u32 net_id = msg_bc_netid(msg); + u32 type = msg_type(msg); + + msg_get_media_addr(msg,&media_addr); + msg_dbg(msg, "RECV:"); + buf_discard(buf); + + if (net_id != tipc_net_id) + return; + if (!addr_domain_valid(dest)) + return; + if (!addr_node_valid(orig)) + return; + if (orig == tipc_own_addr) + return; + if (!in_scope(dest, tipc_own_addr)) + return; + if (is_slave(tipc_own_addr) && is_slave(orig)) + return; + if (is_slave(orig) && !in_own_cluster(orig)) + return; + if (in_own_cluster(orig)) { + /* Always accept link here */ + struct sk_buff *rbuf; + struct tipc_media_addr *addr; + struct node *n_ptr = node_find(orig); + int link_up; + dbg(" in own cluster\n"); + if (n_ptr == NULL) { + n_ptr = node_create(orig); + } + if (n_ptr == NULL) { + warn("Memory squeeze; Failed to create node\n"); + return; + } + spin_lock_bh(&n_ptr->lock); + link = n_ptr->links[b_ptr->identity]; + if (!link) { + dbg("creating link\n"); + link = link_create(b_ptr, orig, &media_addr); + if (!link) { + spin_unlock_bh(&n_ptr->lock); + return; + } + } + addr = &link->media_addr; + if (memcmp(addr, &media_addr, sizeof(*addr))) { + char addr_string[16]; + + warn("New bearer address for %s\n", + addr_string_fill(addr_string, orig)); + memcpy(addr, &media_addr, sizeof(*addr)); + link_reset(link); + } + link_up = link_is_up(link); + spin_unlock_bh(&n_ptr->lock); + if ((type == DSC_RESP_MSG) || link_up) + return; + rbuf = disc_init_msg(DSC_RESP_MSG, 1, orig, b_ptr); + if (rbuf != NULL) { + msg_dbg(buf_msg(rbuf),"SEND:"); + b_ptr->media->send_msg(rbuf, &b_ptr->publ, &media_addr); + buf_discard(rbuf); + } + } +} + +/** + * disc_stop_link_req - stop sending periodic link setup requests + * @req: ptr to link request structure + */ + +void disc_stop_link_req(struct link_req *req) +{ + if (!req) + return; + + k_cancel_timer(&req->timer); + k_term_timer(&req->timer); + buf_discard(req->buf); + kfree(req); +} + +/** + * disc_update_link_req - update 
frequency of periodic link setup requests + * @req: ptr to link request structure + */ + +void disc_update_link_req(struct link_req *req) +{ + if (!req) + return; + + if (req->timer_intv == TIPC_LINK_REQ_SLOW) { + if (!req->bearer->nodes.count) { + req->timer_intv = TIPC_LINK_REQ_FAST; + k_start_timer(&req->timer, req->timer_intv); + } + } else if (req->timer_intv == TIPC_LINK_REQ_FAST) { + if (req->bearer->nodes.count) { + req->timer_intv = TIPC_LINK_REQ_SLOW; + k_start_timer(&req->timer, req->timer_intv); + } + } else { + /* leave timer "as is" if haven't yet reached a "normal" rate */ + } +} + +/** + * disc_timeout - send a periodic link setup request + * @req: ptr to link request structure + * + * Called whenever a link setup request timer associated with a bearer expires. + */ + +static void disc_timeout(struct link_req *req) +{ + spin_lock_bh(&req->bearer->publ.lock); + + req->bearer->media->send_msg(req->buf, &req->bearer->publ, &req->dest); + + if ((req->timer_intv == TIPC_LINK_REQ_SLOW) || + (req->timer_intv == TIPC_LINK_REQ_FAST)) { + /* leave timer interval "as is" if already at a "normal" rate */ + } else { + req->timer_intv *= 2; + if (req->timer_intv > TIPC_LINK_REQ_SLOW) + req->timer_intv = TIPC_LINK_REQ_SLOW; + if ((req->timer_intv == TIPC_LINK_REQ_FAST) && + (req->bearer->nodes.count)) + req->timer_intv = TIPC_LINK_REQ_SLOW; + } + k_start_timer(&req->timer, req->timer_intv); + + spin_unlock_bh(&req->bearer->publ.lock); +} + +/** + * disc_init_link_req - start sending periodic link setup requests + * @b_ptr: ptr to bearer issuing requests + * @dest: destination address for request messages + * @dest_domain: network domain of node(s) which should respond to message + * @req_links: max number of desired links + * + * Returns pointer to link request structure, or NULL if unable to create. 
+ */ + +struct link_req *disc_init_link_req(struct bearer *b_ptr, + const struct tipc_media_addr *dest, + u32 dest_domain, + u32 req_links) +{ + struct link_req *req; + + req = (struct link_req *)kmalloc(sizeof(*req), GFP_ATOMIC); + if (!req) + return NULL; + + req->buf = disc_init_msg(DSC_REQ_MSG, req_links, dest_domain, b_ptr); + if (!req->buf) { + kfree(req); + return NULL; + } + + memcpy(&req->dest, dest, sizeof(*dest)); + req->bearer = b_ptr; + req->timer_intv = TIPC_LINK_REQ_INIT; + k_init_timer(&req->timer, (Handler)disc_timeout, (unsigned long)req); + k_start_timer(&req->timer, req->timer_intv); + return req; +} + diff --git a/net/tipc/discover.h b/net/tipc/discover.h new file mode 100644 index 000000000000..2a6114d91626 --- /dev/null +++ b/net/tipc/discover.h @@ -0,0 +1,58 @@ +/* + * net/tipc/discover.h + * + * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_DISCOVER_H +#define _TIPC_DISCOVER_H + +#include <linux/tipc.h> + +struct link_req; + +struct link_req *disc_init_link_req(struct bearer *b_ptr, + const struct tipc_media_addr *dest, + u32 dest_domain, + u32 req_links); +void disc_update_link_req(struct link_req *req); +void disc_stop_link_req(struct link_req *req); + +void disc_recv_msg(struct sk_buff *buf); + +void disc_link_event(u32 addr, char *name, int up); +#if 0 +int disc_create_link(const struct tipc_link_create *argv); +#endif + +#endif diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c new file mode 100644 index 000000000000..34d0462db3aa --- /dev/null +++ b/net/tipc/eth_media.c @@ -0,0 +1,299 @@ +/* + * net/tipc/eth_media.c: Ethernet bearer support for TIPC + * + * Copyright (c) 2001-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <net/tipc/tipc.h> +#include <net/tipc/tipc_bearer.h> +#include <net/tipc/tipc_msg.h> +#include <linux/netdevice.h> +#include <linux/version.h> + +#define MAX_ETH_BEARERS 2 +#define TIPC_PROTOCOL 0x88ca +#define ETH_LINK_PRIORITY 10 +#define ETH_LINK_TOLERANCE TIPC_DEF_LINK_TOL + + +/** + * struct eth_bearer - Ethernet bearer data structure + * @bearer: ptr to associated "generic" bearer structure + * @dev: ptr to associated Ethernet network device + * @tipc_packet_type: used in binding TIPC to Ethernet driver + */ + +struct eth_bearer { + struct tipc_bearer *bearer; + struct net_device *dev; + struct packet_type tipc_packet_type; +}; + +static struct eth_bearer eth_bearers[MAX_ETH_BEARERS]; +static int eth_started = 0; +static struct notifier_block notifier; + +/** + * send_msg - send a TIPC message out over an Ethernet interface + */ + +static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, + struct tipc_media_addr *dest) +{ + struct sk_buff *clone; + struct net_device *dev; + + clone = skb_clone(buf, GFP_ATOMIC); + if (clone) { + clone->nh.raw = clone->data; + dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; + clone->dev = dev; + dev->hard_header(clone, dev, TIPC_PROTOCOL, + &dest->dev_addr.eth_addr, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); + } + return TIPC_OK; +} + +/** + * recv_msg - handle incoming TIPC message from an Ethernet interface + * + * Routine truncates any Ethernet padding/CRC appended to the message, + * and ensures message size matches actual length + */ + +static int recv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv; + u32 size; + + if (likely(eb_ptr->bearer)) { + size = msg_size((struct tipc_msg *)buf->data); + skb_trim(buf, size); + if (likely(buf->len == size)) { + buf->next = NULL; + tipc_recv_msg(buf, eb_ptr->bearer); + } else { + kfree_skb(buf); + } + } else 
{ + kfree_skb(buf); + } + return TIPC_OK; +} + +/** + * enable_bearer - attach TIPC bearer to an Ethernet interface + */ + +static int enable_bearer(struct tipc_bearer *tb_ptr) +{ + struct net_device *dev = dev_base; + struct eth_bearer *eb_ptr = &eth_bearers[0]; + struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; + char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; + + /* Find device with specified name */ + + while (dev && dev->name && + (memcmp(dev->name, driver_name, strlen(dev->name)))) { + dev = dev->next; + } + if (!dev) + return -ENODEV; + + /* Find Ethernet bearer for device (or create one) */ + + for (;(eb_ptr != stop) && eb_ptr->dev && (eb_ptr->dev != dev); eb_ptr++); + if (eb_ptr == stop) + return -EDQUOT; + if (!eb_ptr->dev) { + eb_ptr->dev = dev; + eb_ptr->tipc_packet_type.type = __constant_htons(TIPC_PROTOCOL); + eb_ptr->tipc_packet_type.dev = dev; + eb_ptr->tipc_packet_type.func = recv_msg; + eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; + INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list)); + dev_hold(dev); + dev_add_pack(&eb_ptr->tipc_packet_type); + } + + /* Associate TIPC bearer with Ethernet bearer */ + + eb_ptr->bearer = tb_ptr; + tb_ptr->usr_handle = (void *)eb_ptr; + tb_ptr->mtu = dev->mtu; + tb_ptr->blocked = 0; + tb_ptr->addr.type = htonl(TIPC_MEDIA_TYPE_ETH); + memcpy(&tb_ptr->addr.dev_addr, &dev->dev_addr, ETH_ALEN); + return 0; +} + +/** + * disable_bearer - detach TIPC bearer from an Ethernet interface + * + * We really should do dev_remove_pack() here, but this function can not be + * called at tasklet level. => Use eth_bearer->bearer as a flag to throw away + * incoming buffers, & postpone dev_remove_pack() to eth_media_stop() on exit. + */ + +static void disable_bearer(struct tipc_bearer *tb_ptr) +{ + ((struct eth_bearer *)tb_ptr->usr_handle)->bearer = 0; +} + +/** + * recv_notification - handle device updates from OS + * + * Change the state of the Ethernet bearer (if any) associated with the + * specified device. 
+ */ + +static int recv_notification(struct notifier_block *nb, unsigned long evt, + void *dv) +{ + struct net_device *dev = (struct net_device *)dv; + struct eth_bearer *eb_ptr = &eth_bearers[0]; + struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; + + while ((eb_ptr->dev != dev)) { + if (++eb_ptr == stop) + return NOTIFY_DONE; /* couldn't find device */ + } + if (!eb_ptr->bearer) + return NOTIFY_DONE; /* bearer had been disabled */ + + eb_ptr->bearer->mtu = dev->mtu; + + switch (evt) { + case NETDEV_CHANGE: + if (netif_carrier_ok(dev)) + tipc_continue(eb_ptr->bearer); + else + tipc_block_bearer(eb_ptr->bearer->name); + break; + case NETDEV_UP: + tipc_continue(eb_ptr->bearer); + break; + case NETDEV_DOWN: + tipc_block_bearer(eb_ptr->bearer->name); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + tipc_block_bearer(eb_ptr->bearer->name); + tipc_continue(eb_ptr->bearer); + break; + case NETDEV_UNREGISTER: + case NETDEV_CHANGENAME: + tipc_disable_bearer(eb_ptr->bearer->name); + break; + } + return NOTIFY_OK; +} + +/** + * eth_addr2str - convert Ethernet address to string + */ + +static char *eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) +{ + unchar *addr = (unchar *)&a->dev_addr; + + if (str_size < 18) + *str_buf = '\0'; + else + sprintf(str_buf, "%02x:%02x:%02x:%02x:%02x:%02x", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + return str_buf; +} + +/** + * eth_media_start - activate Ethernet bearer support + * + * Register Ethernet media type with TIPC bearer code. Also register + * with OS for notifications about device state changes. 
+ */ + +int eth_media_start(void) +{ + struct tipc_media_addr bcast_addr; + int res; + + if (eth_started) + return -EINVAL; + + memset(&bcast_addr, 0xff, sizeof(bcast_addr)); + memset(eth_bearers, 0, sizeof(eth_bearers)); + + res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth", + enable_bearer, disable_bearer, send_msg, + eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY, + ETH_LINK_TOLERANCE, TIPC_DEF_LINK_WIN); + if (res) + return res; + + notifier.notifier_call = &recv_notification; + notifier.priority = 0; + res = register_netdevice_notifier(&notifier); + if (!res) + eth_started = 1; + return res; +} + +/** + * eth_media_stop - deactivate Ethernet bearer support + */ + +void eth_media_stop(void) +{ + int i; + + if (!eth_started) + return; + + unregister_netdevice_notifier(&notifier); + for (i = 0; i < MAX_ETH_BEARERS ; i++) { + if (eth_bearers[i].bearer) { + eth_bearers[i].bearer->blocked = 1; + eth_bearers[i].bearer = 0; + } + if (eth_bearers[i].dev) { + dev_remove_pack(&eth_bearers[i].tipc_packet_type); + dev_put(eth_bearers[i].dev); + } + } + memset(&eth_bearers, 0, sizeof(eth_bearers)); + eth_started = 0; +} diff --git a/net/tipc/handler.c b/net/tipc/handler.c new file mode 100644 index 000000000000..f320010f8a65 --- /dev/null +++ b/net/tipc/handler.c @@ -0,0 +1,132 @@ +/* + * net/tipc/handler.c: TIPC signal handling + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" + +struct queue_item { + struct list_head next_signal; + void (*handler) (unsigned long); + unsigned long data; +}; + +static kmem_cache_t *tipc_queue_item_cache; +static struct list_head signal_queue_head; +static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED; +static int handler_enabled = 0; + +static void process_signal_queue(unsigned long dummy); + +static DECLARE_TASKLET_DISABLED(tipc_tasklet, process_signal_queue, 0); + + +unsigned int k_signal(Handler routine, unsigned long argument) +{ + struct queue_item *item; + + if (!handler_enabled) { + err("Signal request ignored by handler\n"); + return -ENOPROTOOPT; + } + + spin_lock_bh(&qitem_lock); + item = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC); + if (!item) { + err("Signal queue out of memory\n"); + spin_unlock_bh(&qitem_lock); + return -ENOMEM; + } + item->handler = routine; + item->data = argument; + list_add_tail(&item->next_signal, &signal_queue_head); + spin_unlock_bh(&qitem_lock); + tasklet_schedule(&tipc_tasklet); + return 0; +} + +static void process_signal_queue(unsigned long dummy) +{ + struct queue_item *__volatile__ item; + struct list_head *l, *n; + + spin_lock_bh(&qitem_lock); + list_for_each_safe(l, n, &signal_queue_head) { + item = list_entry(l, struct queue_item, next_signal); + list_del(&item->next_signal); + spin_unlock_bh(&qitem_lock); + item->handler(item->data); + spin_lock_bh(&qitem_lock); + kmem_cache_free(tipc_queue_item_cache, item); + } + spin_unlock_bh(&qitem_lock); +} + +int handler_start(void) +{ + tipc_queue_item_cache = + kmem_cache_create("tipc_queue_items", sizeof(struct queue_item), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!tipc_queue_item_cache) + return -ENOMEM; + + INIT_LIST_HEAD(&signal_queue_head); + tasklet_enable(&tipc_tasklet); + handler_enabled = 1; + return 0; +} + +void handler_stop(void) +{ + struct list_head *l, *n; + struct queue_item *item; + + if (!handler_enabled) + return; + + handler_enabled = 0; + 
tasklet_disable(&tipc_tasklet); + tasklet_kill(&tipc_tasklet); + + spin_lock_bh(&qitem_lock); + list_for_each_safe(l, n, &signal_queue_head) { + item = list_entry(l, struct queue_item, next_signal); + list_del(&item->next_signal); + kmem_cache_free(tipc_queue_item_cache, item); + } + spin_unlock_bh(&qitem_lock); + + kmem_cache_destroy(tipc_queue_item_cache); +} + diff --git a/net/tipc/link.c b/net/tipc/link.c new file mode 100644 index 000000000000..7265f4be4766 --- /dev/null +++ b/net/tipc/link.c @@ -0,0 +1,3167 @@ +/* + * net/tipc/link.c: TIPC link code + * + * Copyright (c) 1996-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "dbg.h" +#include "link.h" +#include "net.h" +#include "node.h" +#include "port.h" +#include "addr.h" +#include "node_subscr.h" +#include "name_distr.h" +#include "bearer.h" +#include "name_table.h" +#include "discover.h" +#include "config.h" +#include "bcast.h" + + +/* + * Limit for deferred reception queue: + */ + +#define DEF_QUEUE_LIMIT 256u + +/* + * Link state events: + */ + +#define STARTING_EVT 856384768 /* link processing trigger */ +#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ +#define TIMEOUT_EVT 560817u /* link timer expired */ + +/* + * The following two 'message types' is really just implementation + * data conveniently stored in the message header. 
+ * They must not be considered part of the protocol + */ +#define OPEN_MSG 0 +#define CLOSED_MSG 1 + +/* + * State value stored in 'exp_msg_count' + */ + +#define START_CHANGEOVER 100000u + +/** + * struct link_name - deconstructed link name + * @addr_local: network address of node at this end + * @if_local: name of interface at this end + * @addr_peer: network address of node at far end + * @if_peer: name of interface at far end + */ + +struct link_name { + u32 addr_local; + char if_local[TIPC_MAX_IF_NAME]; + u32 addr_peer; + char if_peer[TIPC_MAX_IF_NAME]; +}; + +#if 0 + +/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ + +/** + * struct link_event - link up/down event notification + */ + +struct link_event { + u32 addr; + int up; + void (*fcn)(u32, char *, int); + char name[TIPC_MAX_LINK_NAME]; +}; + +#endif + +static void link_handle_out_of_seq_msg(struct link *l_ptr, + struct sk_buff *buf); +static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf); +static int link_recv_changeover_msg(struct link **l_ptr, struct sk_buff **buf); +static void link_set_supervision_props(struct link *l_ptr, u32 tolerance); +static int link_send_sections_long(struct port *sender, + struct iovec const *msg_sect, + u32 num_sect, u32 destnode); +static void link_check_defragm_bufs(struct link *l_ptr); +static void link_state_event(struct link *l_ptr, u32 event); +static void link_reset_statistics(struct link *l_ptr); +static void link_print(struct link *l_ptr, struct print_buf *buf, + const char *str); + +/* + * Debugging code used by link routines only + * + * When debugging link problems on a system that has multiple links, + * the standard TIPC debugging routines may not be useful since they + * allow the output from multiple links to be intermixed. 
For this reason + * routines of the form "dbg_link_XXX()" have been created that will capture + * debug info into a link's personal print buffer, which can then be dumped + * into the TIPC system log (LOG) upon request. + * + * To enable per-link debugging, use LINK_LOG_BUF_SIZE to specify the size + * of the print buffer used by each link. If LINK_LOG_BUF_SIZE is set to 0, + * the dbg_link_XXX() routines simply send their output to the standard + * debug print buffer (DBG_OUTPUT), if it has been defined; this can be useful + * when there is only a single link in the system being debugged. + * + * Notes: + * - When enabled, LINK_LOG_BUF_SIZE should be set to at least 1000 (bytes) + * - "l_ptr" must be valid when using dbg_link_XXX() macros + */ + +#define LINK_LOG_BUF_SIZE 0 + +#define dbg_link(fmt, arg...) do {if (LINK_LOG_BUF_SIZE) tipc_printf(&l_ptr->print_buf, fmt, ## arg); } while(0) +#define dbg_link_msg(msg, txt) do {if (LINK_LOG_BUF_SIZE) msg_print(&l_ptr->print_buf, msg, txt); } while(0) +#define dbg_link_state(txt) do {if (LINK_LOG_BUF_SIZE) link_print(l_ptr, &l_ptr->print_buf, txt); } while(0) +#define dbg_link_dump() do { \ + if (LINK_LOG_BUF_SIZE) { \ + tipc_printf(LOG, "\n\nDumping link <%s>:\n", l_ptr->name); \ + printbuf_move(LOG, &l_ptr->print_buf); \ + } \ +} while (0) + +static inline void dbg_print_link(struct link *l_ptr, const char *str) +{ + if (DBG_OUTPUT) + link_print(l_ptr, DBG_OUTPUT, str); +} + +static inline void dbg_print_buf_chain(struct sk_buff *root_buf) +{ + if (DBG_OUTPUT) { + struct sk_buff *buf = root_buf; + + while (buf) { + msg_dbg(buf_msg(buf), "In chain: "); + buf = buf->next; + } + } +} + +/* + * Simple inlined link routines + */ + +static inline unsigned int align(unsigned int i) +{ + return (i + 3) & ~3u; +} + +static inline int link_working_working(struct link *l_ptr) +{ + return (l_ptr->state == WORKING_WORKING); +} + +static inline int link_working_unknown(struct link *l_ptr) +{ + return (l_ptr->state == 
WORKING_UNKNOWN); +} + +static inline int link_reset_unknown(struct link *l_ptr) +{ + return (l_ptr->state == RESET_UNKNOWN); +} + +static inline int link_reset_reset(struct link *l_ptr) +{ + return (l_ptr->state == RESET_RESET); +} + +static inline int link_blocked(struct link *l_ptr) +{ + return (l_ptr->exp_msg_count || l_ptr->blocked); +} + +static inline int link_congested(struct link *l_ptr) +{ + return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]); +} + +static inline u32 link_max_pkt(struct link *l_ptr) +{ + return l_ptr->max_pkt; +} + +static inline void link_init_max_pkt(struct link *l_ptr) +{ + u32 max_pkt; + + max_pkt = (l_ptr->b_ptr->publ.mtu & ~3); + if (max_pkt > MAX_MSG_SIZE) + max_pkt = MAX_MSG_SIZE; + + l_ptr->max_pkt_target = max_pkt; + if (l_ptr->max_pkt_target < MAX_PKT_DEFAULT) + l_ptr->max_pkt = l_ptr->max_pkt_target; + else + l_ptr->max_pkt = MAX_PKT_DEFAULT; + + l_ptr->max_pkt_probes = 0; +} + +static inline u32 link_next_sent(struct link *l_ptr) +{ + if (l_ptr->next_out) + return msg_seqno(buf_msg(l_ptr->next_out)); + return mod(l_ptr->next_out_no); +} + +static inline u32 link_last_sent(struct link *l_ptr) +{ + return mod(link_next_sent(l_ptr) - 1); +} + +/* + * Simple non-inlined link routines (i.e. referenced outside this file) + */ + +int link_is_up(struct link *l_ptr) +{ + if (!l_ptr) + return 0; + return (link_working_working(l_ptr) || link_working_unknown(l_ptr)); +} + +int link_is_active(struct link *l_ptr) +{ + return ((l_ptr->owner->active_links[0] == l_ptr) || + (l_ptr->owner->active_links[1] == l_ptr)); +} + +/** + * link_name_validate - validate & (optionally) deconstruct link name + * @name - ptr to link name string + * @name_parts - ptr to area for link name components (or NULL if not needed) + * + * Returns 1 if link name is valid, otherwise 0. 
+ */ + +static int link_name_validate(const char *name, struct link_name *name_parts) +{ + char name_copy[TIPC_MAX_LINK_NAME]; + char *addr_local; + char *if_local; + char *addr_peer; + char *if_peer; + char dummy; + u32 z_local, c_local, n_local; + u32 z_peer, c_peer, n_peer; + u32 if_local_len; + u32 if_peer_len; + + /* copy link name & ensure length is OK */ + + name_copy[TIPC_MAX_LINK_NAME - 1] = 0; + /* need above in case non-Posix strncpy() doesn't pad with nulls */ + strncpy(name_copy, name, TIPC_MAX_LINK_NAME); + if (name_copy[TIPC_MAX_LINK_NAME - 1] != 0) + return 0; + + /* ensure all component parts of link name are present */ + + addr_local = name_copy; + if ((if_local = strchr(addr_local, ':')) == NULL) + return 0; + *(if_local++) = 0; + if ((addr_peer = strchr(if_local, '-')) == NULL) + return 0; + *(addr_peer++) = 0; + if_local_len = addr_peer - if_local; + if ((if_peer = strchr(addr_peer, ':')) == NULL) + return 0; + *(if_peer++) = 0; + if_peer_len = strlen(if_peer) + 1; + + /* validate component parts of link name */ + + if ((sscanf(addr_local, "%u.%u.%u%c", + &z_local, &c_local, &n_local, &dummy) != 3) || + (sscanf(addr_peer, "%u.%u.%u%c", + &z_peer, &c_peer, &n_peer, &dummy) != 3) || + (z_local > 255) || (c_local > 4095) || (n_local > 4095) || + (z_peer > 255) || (c_peer > 4095) || (n_peer > 4095) || + (if_local_len <= 1) || (if_local_len > TIPC_MAX_IF_NAME) || + (if_peer_len <= 1) || (if_peer_len > TIPC_MAX_IF_NAME) || + (strspn(if_local, tipc_alphabet) != (if_local_len - 1)) || + (strspn(if_peer, tipc_alphabet) != (if_peer_len - 1))) + return 0; + + /* return link name components, if necessary */ + + if (name_parts) { + name_parts->addr_local = tipc_addr(z_local, c_local, n_local); + strcpy(name_parts->if_local, if_local); + name_parts->addr_peer = tipc_addr(z_peer, c_peer, n_peer); + strcpy(name_parts->if_peer, if_peer); + } + return 1; +} + +/** + * link_timeout - handle expiration of link timer + * @l_ptr: pointer to link + * + * This routine 
must not grab "net_lock" to avoid a potential deadlock conflict + * with link_delete(). (There is no risk that the node will be deleted by + * another thread because link_delete() always cancels the link timer before + * node_delete() is called.) + */ + +static void link_timeout(struct link *l_ptr) +{ + node_lock(l_ptr->owner); + + /* update counters used in statistical profiling of send traffic */ + + l_ptr->stats.accu_queue_sz += l_ptr->out_queue_size; + l_ptr->stats.queue_sz_counts++; + + if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz) + l_ptr->stats.max_queue_sz = l_ptr->out_queue_size; + + if (l_ptr->first_out) { + struct tipc_msg *msg = buf_msg(l_ptr->first_out); + u32 length = msg_size(msg); + + if ((msg_user(msg) == MSG_FRAGMENTER) + && (msg_type(msg) == FIRST_FRAGMENT)) { + length = msg_size(msg_get_wrapped(msg)); + } + if (length) { + l_ptr->stats.msg_lengths_total += length; + l_ptr->stats.msg_length_counts++; + if (length <= 64) + l_ptr->stats.msg_length_profile[0]++; + else if (length <= 256) + l_ptr->stats.msg_length_profile[1]++; + else if (length <= 1024) + l_ptr->stats.msg_length_profile[2]++; + else if (length <= 4096) + l_ptr->stats.msg_length_profile[3]++; + else if (length <= 16384) + l_ptr->stats.msg_length_profile[4]++; + else if (length <= 32768) + l_ptr->stats.msg_length_profile[5]++; + else + l_ptr->stats.msg_length_profile[6]++; + } + } + + /* do all other link processing performed on a periodic basis */ + + link_check_defragm_bufs(l_ptr); + + link_state_event(l_ptr, TIMEOUT_EVT); + + if (l_ptr->next_out) + link_push_queue(l_ptr); + + node_unlock(l_ptr->owner); +} + +static inline void link_set_timer(struct link *l_ptr, u32 time) +{ + k_start_timer(&l_ptr->timer, time); +} + +/** + * link_create - create a new link + * @b_ptr: pointer to associated bearer + * @peer: network address of node at other end of link + * @media_addr: media address to use when sending messages over link + * + * Returns pointer to link. 
+ */ + +struct link *link_create(struct bearer *b_ptr, const u32 peer, + const struct tipc_media_addr *media_addr) +{ + struct link *l_ptr; + struct tipc_msg *msg; + char *if_name; + + l_ptr = (struct link *)kmalloc(sizeof(*l_ptr), GFP_ATOMIC); + if (!l_ptr) { + warn("Memory squeeze; Failed to create link\n"); + return NULL; + } + memset(l_ptr, 0, sizeof(*l_ptr)); + + l_ptr->addr = peer; + if_name = strchr(b_ptr->publ.name, ':') + 1; + sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:", + tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), + if_name, + tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); + /* note: peer i/f is appended to link name by reset/activate */ + memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); + k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr); + list_add_tail(&l_ptr->link_list, &b_ptr->links); + l_ptr->checkpoint = 1; + l_ptr->b_ptr = b_ptr; + link_set_supervision_props(l_ptr, b_ptr->media->tolerance); + l_ptr->state = RESET_UNKNOWN; + + l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; + msg = l_ptr->pmsg; + msg_init(msg, LINK_PROTOCOL, RESET_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr); + msg_set_size(msg, sizeof(l_ptr->proto_msg)); + msg_set_session(msg, tipc_random); + msg_set_bearer_id(msg, b_ptr->identity); + strcpy((char *)msg_data(msg), if_name); + + l_ptr->priority = b_ptr->priority; + link_set_queue_limits(l_ptr, b_ptr->media->window); + + link_init_max_pkt(l_ptr); + + l_ptr->next_out_no = 1; + INIT_LIST_HEAD(&l_ptr->waiting_ports); + + link_reset_statistics(l_ptr); + + l_ptr->owner = node_attach_link(l_ptr); + if (!l_ptr->owner) { + kfree(l_ptr); + return NULL; + } + + if (LINK_LOG_BUF_SIZE) { + char *pb = kmalloc(LINK_LOG_BUF_SIZE, GFP_ATOMIC); + + if (!pb) { + kfree(l_ptr); + warn("Memory squeeze; Failed to create link\n"); + return NULL; + } + printbuf_init(&l_ptr->print_buf, pb, LINK_LOG_BUF_SIZE); + } + + k_signal((Handler)link_start, (unsigned long)l_ptr); + + 
dbg("link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n", + l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit); + + return l_ptr; +} + +/** + * link_delete - delete a link + * @l_ptr: pointer to link + * + * Note: 'net_lock' is write_locked, bearer is locked. + * This routine must not grab the node lock until after link timer cancellation + * to avoid a potential deadlock situation. + */ + +void link_delete(struct link *l_ptr) +{ + if (!l_ptr) { + err("Attempt to delete non-existent link\n"); + return; + } + + dbg("link_delete()\n"); + + k_cancel_timer(&l_ptr->timer); + + node_lock(l_ptr->owner); + link_reset(l_ptr); + node_detach_link(l_ptr->owner, l_ptr); + link_stop(l_ptr); + list_del_init(&l_ptr->link_list); + if (LINK_LOG_BUF_SIZE) + kfree(l_ptr->print_buf.buf); + node_unlock(l_ptr->owner); + k_term_timer(&l_ptr->timer); + kfree(l_ptr); +} + +void link_start(struct link *l_ptr) +{ + dbg("link_start %x\n", l_ptr); + link_state_event(l_ptr, STARTING_EVT); +} + +/** + * link_schedule_port - schedule port for deferred sending + * @l_ptr: pointer to link + * @origport: reference to sending port + * @sz: amount of data to be sent + * + * Schedules port for renewed sending of messages after link congestion + * has abated. 
+ */ + +static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz) +{ + struct port *p_ptr; + + spin_lock_bh(&port_list_lock); + p_ptr = port_lock(origport); + if (p_ptr) { + if (!p_ptr->wakeup) + goto exit; + if (!list_empty(&p_ptr->wait_list)) + goto exit; + p_ptr->congested_link = l_ptr; + p_ptr->publ.congested = 1; + p_ptr->waiting_pkts = 1 + ((sz - 1) / link_max_pkt(l_ptr)); + list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports); + l_ptr->stats.link_congs++; +exit: + port_unlock(p_ptr); + } + spin_unlock_bh(&port_list_lock); + return -ELINKCONG; +} + +void link_wakeup_ports(struct link *l_ptr, int all) +{ + struct port *p_ptr; + struct port *temp_p_ptr; + int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size; + + if (all) + win = 100000; + if (win <= 0) + return; + if (!spin_trylock_bh(&port_list_lock)) + return; + if (link_congested(l_ptr)) + goto exit; + list_for_each_entry_safe(p_ptr, temp_p_ptr, &l_ptr->waiting_ports, + wait_list) { + if (win <= 0) + break; + list_del_init(&p_ptr->wait_list); + p_ptr->congested_link = 0; + assert(p_ptr->wakeup); + spin_lock_bh(p_ptr->publ.lock); + p_ptr->publ.congested = 0; + p_ptr->wakeup(&p_ptr->publ); + win -= p_ptr->waiting_pkts; + spin_unlock_bh(p_ptr->publ.lock); + } + +exit: + spin_unlock_bh(&port_list_lock); +} + +/** + * link_release_outqueue - purge link's outbound message queue + * @l_ptr: pointer to link + */ + +static void link_release_outqueue(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->first_out; + struct sk_buff *next; + + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + l_ptr->first_out = NULL; + l_ptr->out_queue_size = 0; +} + +/** + * link_reset_fragments - purge link's inbound message fragments queue + * @l_ptr: pointer to link + */ + +void link_reset_fragments(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->defragm_buf; + struct sk_buff *next; + + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + l_ptr->defragm_buf = 
NULL; +} + +/** + * link_stop - purge all inbound and outbound messages associated with link + * @l_ptr: pointer to link + */ + +void link_stop(struct link *l_ptr) +{ + struct sk_buff *buf; + struct sk_buff *next; + + buf = l_ptr->oldest_deferred_in; + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + + buf = l_ptr->first_out; + while (buf) { + next = buf->next; + buf_discard(buf); + buf = next; + } + + link_reset_fragments(l_ptr); + + buf_discard(l_ptr->proto_msg_queue); + l_ptr->proto_msg_queue = NULL; +} + +#if 0 + +/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ + +static void link_recv_event(struct link_event *ev) +{ + ev->fcn(ev->addr, ev->name, ev->up); + kfree(ev); +} + +static void link_send_event(void (*fcn)(u32 a, char *n, int up), + struct link *l_ptr, int up) +{ + struct link_event *ev; + + ev = kmalloc(sizeof(*ev), GFP_ATOMIC); + if (!ev) { + warn("Link event allocation failure\n"); + return; + } + ev->addr = l_ptr->addr; + ev->up = up; + ev->fcn = fcn; + memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME); + k_signal((Handler)link_recv_event, (unsigned long)ev); +} + +#else + +#define link_send_event(fcn, l_ptr, up) do { } while (0) + +#endif + +void link_reset(struct link *l_ptr) +{ + struct sk_buff *buf; + u32 prev_state = l_ptr->state; + u32 checkpoint = l_ptr->next_in_no; + + msg_set_session(l_ptr->pmsg, msg_session(l_ptr->pmsg) + 1); + + /* Link is down, accept any session: */ + l_ptr->peer_session = 0; + + /* Prepare for max packet size negotiation */ + link_init_max_pkt(l_ptr); + + l_ptr->state = RESET_UNKNOWN; + dbg_link_state("Resetting Link\n"); + + if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) + return; + + node_link_down(l_ptr->owner, l_ptr); + bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr); +#if 0 + tipc_printf(CONS, "\nReset link <%s>\n", l_ptr->name); + dbg_link_dump(); +#endif + if (node_has_active_links(l_ptr->owner) && + l_ptr->owner->permit_changeover) { + l_ptr->reset_checkpoint = 
checkpoint; + l_ptr->exp_msg_count = START_CHANGEOVER; + } + + /* Clean up all queues: */ + + link_release_outqueue(l_ptr); + buf_discard(l_ptr->proto_msg_queue); + l_ptr->proto_msg_queue = NULL; + buf = l_ptr->oldest_deferred_in; + while (buf) { + struct sk_buff *next = buf->next; + buf_discard(buf); + buf = next; + } + if (!list_empty(&l_ptr->waiting_ports)) + link_wakeup_ports(l_ptr, 1); + + l_ptr->retransm_queue_head = 0; + l_ptr->retransm_queue_size = 0; + l_ptr->last_out = NULL; + l_ptr->first_out = NULL; + l_ptr->next_out = NULL; + l_ptr->unacked_window = 0; + l_ptr->checkpoint = 1; + l_ptr->next_out_no = 1; + l_ptr->deferred_inqueue_sz = 0; + l_ptr->oldest_deferred_in = NULL; + l_ptr->newest_deferred_in = NULL; + l_ptr->fsm_msg_cnt = 0; + l_ptr->stale_count = 0; + link_reset_statistics(l_ptr); + + link_send_event(cfg_link_event, l_ptr, 0); + if (!in_own_cluster(l_ptr->addr)) + link_send_event(disc_link_event, l_ptr, 0); +} + + +static void link_activate(struct link *l_ptr) +{ + l_ptr->next_in_no = 1; + node_link_up(l_ptr->owner, l_ptr); + bearer_add_dest(l_ptr->b_ptr, l_ptr->addr); + link_send_event(cfg_link_event, l_ptr, 1); + if (!in_own_cluster(l_ptr->addr)) + link_send_event(disc_link_event, l_ptr, 1); +} + +/** + * link_state_event - link finite state machine + * @l_ptr: pointer to link + * @event: state machine event to process + */ + +static void link_state_event(struct link *l_ptr, unsigned event) +{ + struct link *other; + u32 cont_intv = l_ptr->continuity_interval; + + if (!l_ptr->started && (event != STARTING_EVT)) + return; /* Not yet. 
*/ + + if (link_blocked(l_ptr)) { + if (event == TIMEOUT_EVT) { + link_set_timer(l_ptr, cont_intv); + } + return; /* Changeover going on */ + } + dbg_link("STATE_EV: <%s> ", l_ptr->name); + + switch (l_ptr->state) { + case WORKING_WORKING: + dbg_link("WW/"); + switch (event) { + case TRAFFIC_MSG_EVT: + dbg_link("TRF-"); + /* fall through */ + case ACTIVATE_MSG: + dbg_link("ACT\n"); + break; + case TIMEOUT_EVT: + dbg_link("TIM "); + if (l_ptr->next_in_no != l_ptr->checkpoint) { + l_ptr->checkpoint = l_ptr->next_in_no; + if (bclink_acks_missing(l_ptr->owner)) { + link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } else if (l_ptr->max_pkt < l_ptr->max_pkt_target) { + link_send_proto_msg(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } + link_set_timer(l_ptr, cont_intv); + break; + } + dbg_link(" -> WU\n"); + l_ptr->state = WORKING_UNKNOWN; + l_ptr->fsm_msg_cnt = 0; + link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv / 4); + break; + case RESET_MSG: + dbg_link("RES -> RR\n"); + link_reset(l_ptr); + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + default: + err("Unknown link event %u in WW state\n", event); + } + break; + case WORKING_UNKNOWN: + dbg_link("WU/"); + switch (event) { + case TRAFFIC_MSG_EVT: + dbg_link("TRF-"); + case ACTIVATE_MSG: + dbg_link("ACT -> WW\n"); + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + dbg_link("RES -> RR\n"); + link_reset(l_ptr); + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case TIMEOUT_EVT: + dbg_link("TIM "); + if (l_ptr->next_in_no != l_ptr->checkpoint) { + dbg_link("-> WW \n"); 
+ l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + l_ptr->checkpoint = l_ptr->next_in_no; + if (bclink_acks_missing(l_ptr->owner)) { + link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + } + link_set_timer(l_ptr, cont_intv); + } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) { + dbg_link("Probing %u/%u,timer = %u ms)\n", + l_ptr->fsm_msg_cnt, l_ptr->abort_limit, + cont_intv / 4); + link_send_proto_msg(l_ptr, STATE_MSG, + 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv / 4); + } else { /* Link has failed */ + dbg_link("-> RU (%u probes unanswered)\n", + l_ptr->fsm_msg_cnt); + link_reset(l_ptr); + l_ptr->state = RESET_UNKNOWN; + l_ptr->fsm_msg_cnt = 0; + link_send_proto_msg(l_ptr, RESET_MSG, + 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + } + break; + default: + err("Unknown link event %u in WU state\n", event); + } + break; + case RESET_UNKNOWN: + dbg_link("RU/"); + switch (event) { + case TRAFFIC_MSG_EVT: + dbg_link("TRF-\n"); + break; + case ACTIVATE_MSG: + other = l_ptr->owner->active_links[0]; + if (other && link_working_unknown(other)) { + dbg_link("ACT\n"); + break; + } + dbg_link("ACT -> WW\n"); + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_activate(l_ptr); + link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + dbg_link("RES \n"); + dbg_link(" -> RR\n"); + l_ptr->state = RESET_RESET; + l_ptr->fsm_msg_cnt = 0; + link_send_proto_msg(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case STARTING_EVT: + dbg_link("START-"); + l_ptr->started = 1; + /* fall through */ + case TIMEOUT_EVT: + dbg_link("TIM \n"); + link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + default: + err("Unknown link event %u in RU state\n", event); + } 
+ break; + case RESET_RESET: + dbg_link("RR/ "); + switch (event) { + case TRAFFIC_MSG_EVT: + dbg_link("TRF-"); + /* fall through */ + case ACTIVATE_MSG: + other = l_ptr->owner->active_links[0]; + if (other && link_working_unknown(other)) { + dbg_link("ACT\n"); + break; + } + dbg_link("ACT -> WW\n"); + l_ptr->state = WORKING_WORKING; + l_ptr->fsm_msg_cnt = 0; + link_activate(l_ptr); + link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + break; + case RESET_MSG: + dbg_link("RES\n"); + break; + case TIMEOUT_EVT: + dbg_link("TIM\n"); + link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0); + l_ptr->fsm_msg_cnt++; + link_set_timer(l_ptr, cont_intv); + dbg_link("fsm_msg_cnt %u\n", l_ptr->fsm_msg_cnt); + break; + default: + err("Unknown link event %u in RR state\n", event); + } + break; + default: + err("Unknown link state %u/%u\n", l_ptr->state, event); + } +} + +/* + * link_bundle_buf(): Append contents of a buffer to + * the tail of an existing one. 
+ */ + +static int link_bundle_buf(struct link *l_ptr, + struct sk_buff *bundler, + struct sk_buff *buf) +{ + struct tipc_msg *bundler_msg = buf_msg(bundler); + struct tipc_msg *msg = buf_msg(buf); + u32 size = msg_size(msg); + u32 to_pos = align(msg_size(bundler_msg)); + u32 rest = link_max_pkt(l_ptr) - to_pos; + + if (msg_user(bundler_msg) != MSG_BUNDLER) + return 0; + if (msg_type(bundler_msg) != OPEN_MSG) + return 0; + if (rest < align(size)) + return 0; + + skb_put(bundler, (to_pos - msg_size(bundler_msg)) + size); + memcpy(bundler->data + to_pos, buf->data, size); + msg_set_size(bundler_msg, to_pos + size); + msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1); + dbg("Packed msg # %u(%u octets) into pos %u in buf(#%u)\n", + msg_msgcnt(bundler_msg), size, to_pos, msg_seqno(bundler_msg)); + msg_dbg(msg, "PACKD:"); + buf_discard(buf); + l_ptr->stats.sent_bundled++; + return 1; +} + +static inline void link_add_to_outqueue(struct link *l_ptr, + struct sk_buff *buf, + struct tipc_msg *msg) +{ + u32 ack = mod(l_ptr->next_in_no - 1); + u32 seqno = mod(l_ptr->next_out_no++); + + msg_set_word(msg, 2, ((ack << 16) | seqno)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + buf->next = NULL; + if (l_ptr->first_out) { + l_ptr->last_out->next = buf; + l_ptr->last_out = buf; + } else + l_ptr->first_out = l_ptr->last_out = buf; + l_ptr->out_queue_size++; +} + +/* + * link_send_buf() is the 'full path' for messages, called from + * inside TIPC when the 'fast path' in tipc_send_buf + * has failed, and from link_send() + */ + +int link_send_buf(struct link *l_ptr, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 size = msg_size(msg); + u32 dsz = msg_data_sz(msg); + u32 queue_size = l_ptr->out_queue_size; + u32 imp = msg_tot_importance(msg); + u32 queue_limit = l_ptr->queue_limit[imp]; + u32 max_packet = link_max_pkt(l_ptr); + + msg_set_prevnode(msg, tipc_own_addr); /* If routed message */ + + /* Match msg importance against queue limits: */ + 
+ if (unlikely(queue_size >= queue_limit)) { + if (imp <= TIPC_CRITICAL_IMPORTANCE) { + return link_schedule_port(l_ptr, msg_origport(msg), + size); + } + msg_dbg(msg, "TIPC: Congestion, throwing away\n"); + buf_discard(buf); + if (imp > CONN_MANAGER) { + warn("Resetting <%s>, send queue full", l_ptr->name); + link_reset(l_ptr); + } + return dsz; + } + + /* Fragmentation needed ? */ + + if (size > max_packet) + return link_send_long_buf(l_ptr, buf); + + /* Packet can be queued or sent: */ + + if (queue_size > l_ptr->stats.max_queue_sz) + l_ptr->stats.max_queue_sz = queue_size; + + if (likely(!bearer_congested(l_ptr->b_ptr, l_ptr) && + !link_congested(l_ptr))) { + link_add_to_outqueue(l_ptr, buf, msg); + + if (likely(bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr))) { + l_ptr->unacked_window = 0; + } else { + bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->next_out = buf; + } + return dsz; + } + /* Congestion: can message be bundled ?: */ + + if ((msg_user(msg) != CHANGEOVER_PROTOCOL) && + (msg_user(msg) != MSG_FRAGMENTER)) { + + /* Try adding message to an existing bundle */ + + if (l_ptr->next_out && + link_bundle_buf(l_ptr, l_ptr->last_out, buf)) { + bearer_resolve_congestion(l_ptr->b_ptr, l_ptr); + return dsz; + } + + /* Try creating a new bundle */ + + if (size <= max_packet * 2 / 3) { + struct sk_buff *bundler = buf_acquire(max_packet); + struct tipc_msg bundler_hdr; + + if (bundler) { + msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG, + TIPC_OK, INT_H_SIZE, l_ptr->addr); + memcpy(bundler->data, (unchar *)&bundler_hdr, + INT_H_SIZE); + skb_trim(bundler, INT_H_SIZE); + link_bundle_buf(l_ptr, bundler, buf); + buf = bundler; + msg = buf_msg(buf); + l_ptr->stats.sent_bundles++; + } + } + } + if (!l_ptr->next_out) + l_ptr->next_out = buf; + link_add_to_outqueue(l_ptr, buf, msg); + bearer_resolve_congestion(l_ptr->b_ptr, l_ptr); + return dsz; +} + +/* + * link_send(): same as link_send_buf(), but the link to use has + * not been 
selected yet, and the the owner node is not locked + * Called by TIPC internal users, e.g. the name distributor + */ + +int link_send(struct sk_buff *buf, u32 dest, u32 selector) +{ + struct link *l_ptr; + struct node *n_ptr; + int res = -ELINKCONG; + + read_lock_bh(&net_lock); + n_ptr = node_select(dest, selector); + if (n_ptr) { + node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector & 1]; + dbg("link_send: found link %x for dest %x\n", l_ptr, dest); + if (l_ptr) { + res = link_send_buf(l_ptr, buf); + } + node_unlock(n_ptr); + } else { + dbg("Attempt to send msg to unknown node:\n"); + msg_dbg(buf_msg(buf),">>>"); + buf_discard(buf); + } + read_unlock_bh(&net_lock); + return res; +} + +/* + * link_send_buf_fast: Entry for data messages where the + * destination link is known and the header is complete, + * inclusive total message length. Very time critical. + * Link is locked. Returns user data length. + */ + +static inline int link_send_buf_fast(struct link *l_ptr, struct sk_buff *buf, + u32 *used_max_pkt) +{ + struct tipc_msg *msg = buf_msg(buf); + int res = msg_data_sz(msg); + + if (likely(!link_congested(l_ptr))) { + if (likely(msg_size(msg) <= link_max_pkt(l_ptr))) { + if (likely(list_empty(&l_ptr->b_ptr->cong_links))) { + link_add_to_outqueue(l_ptr, buf, msg); + if (likely(bearer_send(l_ptr->b_ptr, buf, + &l_ptr->media_addr))) { + l_ptr->unacked_window = 0; + msg_dbg(msg,"SENT_FAST:"); + return res; + } + dbg("failed sent fast...\n"); + bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->next_out = buf; + return res; + } + } + else + *used_max_pkt = link_max_pkt(l_ptr); + } + return link_send_buf(l_ptr, buf); /* All other cases */ +} + +/* + * tipc_send_buf_fast: Entry for data messages where the + * destination node is known and the header is complete, + * inclusive total message length. + * Returns user data length. 
+ */ +int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode) +{ + struct link *l_ptr; + struct node *n_ptr; + int res; + u32 selector = msg_origport(buf_msg(buf)) & 1; + u32 dummy; + + if (destnode == tipc_own_addr) + return port_recv_msg(buf); + + read_lock_bh(&net_lock); + n_ptr = node_select(destnode, selector); + if (likely(n_ptr)) { + node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector]; + dbg("send_fast: buf %x selected %x, destnode = %x\n", + buf, l_ptr, destnode); + if (likely(l_ptr)) { + res = link_send_buf_fast(l_ptr, buf, &dummy); + node_unlock(n_ptr); + read_unlock_bh(&net_lock); + return res; + } + node_unlock(n_ptr); + } + read_unlock_bh(&net_lock); + res = msg_data_sz(buf_msg(buf)); + tipc_reject_msg(buf, TIPC_ERR_NO_NODE); + return res; +} + + +/* + * link_send_sections_fast: Entry for messages where the + * destination processor is known and the header is complete, + * except for total message length. + * Returns user data length or errno. + */ +int link_send_sections_fast(struct port *sender, + struct iovec const *msg_sect, + const u32 num_sect, + u32 destaddr) +{ + struct tipc_msg *hdr = &sender->publ.phdr; + struct link *l_ptr; + struct sk_buff *buf; + struct node *node; + int res; + u32 selector = msg_origport(hdr) & 1; + + assert(destaddr != tipc_own_addr); + +again: + /* + * Try building message using port's max_pkt hint. + * (Must not hold any locks while building message.) 
+ */ + + res = msg_build(hdr, msg_sect, num_sect, sender->max_pkt, + !sender->user_port, &buf); + + read_lock_bh(&net_lock); + node = node_select(destaddr, selector); + if (likely(node)) { + node_lock(node); + l_ptr = node->active_links[selector]; + if (likely(l_ptr)) { + if (likely(buf)) { + res = link_send_buf_fast(l_ptr, buf, + &sender->max_pkt); + if (unlikely(res < 0)) + buf_discard(buf); +exit: + node_unlock(node); + read_unlock_bh(&net_lock); + return res; + } + + /* Exit if build request was invalid */ + + if (unlikely(res < 0)) + goto exit; + + /* Exit if link (or bearer) is congested */ + + if (link_congested(l_ptr) || + !list_empty(&l_ptr->b_ptr->cong_links)) { + res = link_schedule_port(l_ptr, + sender->publ.ref, res); + goto exit; + } + + /* + * Message size exceeds max_pkt hint; update hint, + * then re-try fast path or fragment the message + */ + + sender->max_pkt = link_max_pkt(l_ptr); + node_unlock(node); + read_unlock_bh(&net_lock); + + + if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt) + goto again; + + return link_send_sections_long(sender, msg_sect, + num_sect, destaddr); + } + node_unlock(node); + } + read_unlock_bh(&net_lock); + + /* Couldn't find a link to the destination node */ + + if (buf) + return tipc_reject_msg(buf, TIPC_ERR_NO_NODE); + if (res >= 0) + return port_reject_sections(sender, hdr, msg_sect, num_sect, + TIPC_ERR_NO_NODE); + return res; +} + +/* + * link_send_sections_long(): Entry for long messages where the + * destination node is known and the header is complete, + * inclusive total message length. + * Link and bearer congestion status have been checked to be ok, + * and are ignored if they change. + * + * Note that fragments do not use the full link MTU so that they won't have + * to undergo refragmentation if link changeover causes them to be sent + * over another link with an additional tunnel header added as prefix. + * (Refragmentation will still occur if the other link has a smaller MTU.) 
+ * + * Returns user data length or errno. + */ +static int link_send_sections_long(struct port *sender, + struct iovec const *msg_sect, + u32 num_sect, + u32 destaddr) +{ + struct link *l_ptr; + struct node *node; + struct tipc_msg *hdr = &sender->publ.phdr; + u32 dsz = msg_data_sz(hdr); + u32 max_pkt,fragm_sz,rest; + struct tipc_msg fragm_hdr; + struct sk_buff *buf,*buf_chain,*prev; + u32 fragm_crs,fragm_rest,hsz,sect_rest; + const unchar *sect_crs; + int curr_sect; + u32 fragm_no; + +again: + fragm_no = 1; + max_pkt = sender->max_pkt - INT_H_SIZE; + /* leave room for tunnel header in case of link changeover */ + fragm_sz = max_pkt - INT_H_SIZE; + /* leave room for fragmentation header in each fragment */ + rest = dsz; + fragm_crs = 0; + fragm_rest = 0; + sect_rest = 0; + sect_crs = 0; + curr_sect = -1; + + /* Prepare reusable fragment header: */ + + msg_dbg(hdr, ">FRAGMENTING>"); + msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + TIPC_OK, INT_H_SIZE, msg_destnode(hdr)); + msg_set_link_selector(&fragm_hdr, sender->publ.ref); + msg_set_size(&fragm_hdr, max_pkt); + msg_set_fragm_no(&fragm_hdr, 1); + + /* Prepare header of first fragment: */ + + buf_chain = buf = buf_acquire(max_pkt); + if (!buf) + return -ENOMEM; + buf->next = NULL; + memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE); + hsz = msg_hdr_sz(hdr); + memcpy(buf->data + INT_H_SIZE, (unchar *)hdr, hsz); + msg_dbg(buf_msg(buf), ">BUILD>"); + + /* Chop up message: */ + + fragm_crs = INT_H_SIZE + hsz; + fragm_rest = fragm_sz - hsz; + + do { /* For all sections */ + u32 sz; + + if (!sect_rest) { + sect_rest = msg_sect[++curr_sect].iov_len; + sect_crs = (const unchar *)msg_sect[curr_sect].iov_base; + } + + if (sect_rest < fragm_rest) + sz = sect_rest; + else + sz = fragm_rest; + + if (likely(!sender->user_port)) { + if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) { +error: + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + return -EFAULT; + } + } 
else + memcpy(buf->data + fragm_crs, sect_crs, sz); + + sect_crs += sz; + sect_rest -= sz; + fragm_crs += sz; + fragm_rest -= sz; + rest -= sz; + + if (!fragm_rest && rest) { + + /* Initiate new fragment: */ + if (rest <= fragm_sz) { + fragm_sz = rest; + msg_set_type(&fragm_hdr,LAST_FRAGMENT); + } else { + msg_set_type(&fragm_hdr, FRAGMENT); + } + msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); + msg_set_fragm_no(&fragm_hdr, ++fragm_no); + prev = buf; + buf = buf_acquire(fragm_sz + INT_H_SIZE); + if (!buf) + goto error; + + buf->next = NULL; + prev->next = buf; + memcpy(buf->data, (unchar *)&fragm_hdr, INT_H_SIZE); + fragm_crs = INT_H_SIZE; + fragm_rest = fragm_sz; + msg_dbg(buf_msg(buf)," >BUILD>"); + } + } + while (rest > 0); + + /* + * Now we have a buffer chain. Select a link and check + * that packet size is still OK + */ + node = node_select(destaddr, sender->publ.ref & 1); + if (likely(node)) { + node_lock(node); + l_ptr = node->active_links[sender->publ.ref & 1]; + if (!l_ptr) { + node_unlock(node); + goto reject; + } + if (link_max_pkt(l_ptr) < max_pkt) { + sender->max_pkt = link_max_pkt(l_ptr); + node_unlock(node); + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + goto again; + } + } else { +reject: + for (; buf_chain; buf_chain = buf) { + buf = buf_chain->next; + buf_discard(buf_chain); + } + return port_reject_sections(sender, hdr, msg_sect, num_sect, + TIPC_ERR_NO_NODE); + } + + /* Append whole chain to send queue: */ + + buf = buf_chain; + l_ptr->long_msg_seq_no = mod(l_ptr->long_msg_seq_no + 1); + if (!l_ptr->next_out) + l_ptr->next_out = buf_chain; + l_ptr->stats.sent_fragmented++; + while (buf) { + struct sk_buff *next = buf->next; + struct tipc_msg *msg = buf_msg(buf); + + l_ptr->stats.sent_fragments++; + msg_set_long_msgno(msg, l_ptr->long_msg_seq_no); + link_add_to_outqueue(l_ptr, buf, msg); + msg_dbg(msg, ">ADD>"); + buf = next; + } + + /* Send it, if possible: */ + + link_push_queue(l_ptr); + 
node_unlock(node); + return dsz; +} + +/* + * link_push_packet: Push one unsent packet to the media + */ +u32 link_push_packet(struct link *l_ptr) +{ + struct sk_buff *buf = l_ptr->first_out; + u32 r_q_size = l_ptr->retransm_queue_size; + u32 r_q_head = l_ptr->retransm_queue_head; + + /* Step to position where retransmission failed, if any, */ + /* consider that buffers may have been released in meantime */ + + if (r_q_size && buf) { + u32 last = lesser(mod(r_q_head + r_q_size), + link_last_sent(l_ptr)); + u32 first = msg_seqno(buf_msg(buf)); + + while (buf && less(first, r_q_head)) { + first = mod(first + 1); + buf = buf->next; + } + l_ptr->retransm_queue_head = r_q_head = first; + l_ptr->retransm_queue_size = r_q_size = mod(last - first); + } + + /* Continue retransmission now, if there is anything: */ + + if (r_q_size && buf && !skb_cloned(buf)) { + msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(buf_msg(buf), l_ptr->owner->bclink.last_in); + if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + msg_dbg(buf_msg(buf), ">DEF-RETR>"); + l_ptr->retransm_queue_head = mod(++r_q_head); + l_ptr->retransm_queue_size = --r_q_size; + l_ptr->stats.retransmitted++; + return TIPC_OK; + } else { + l_ptr->stats.bearer_congs++; + msg_dbg(buf_msg(buf), "|>DEF-RETR>"); + return PUSH_FAILED; + } + } + + /* Send deferred protocol message, if any: */ + + buf = l_ptr->proto_msg_queue; + if (buf) { + msg_set_ack(buf_msg(buf), mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(buf_msg(buf),l_ptr->owner->bclink.last_in); + if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + msg_dbg(buf_msg(buf), ">DEF-PROT>"); + l_ptr->unacked_window = 0; + buf_discard(buf); + l_ptr->proto_msg_queue = 0; + return TIPC_OK; + } else { + msg_dbg(buf_msg(buf), "|>DEF-PROT>"); + l_ptr->stats.bearer_congs++; + return PUSH_FAILED; + } + } + + /* Send one deferred data message, if send window not full: */ + + buf = l_ptr->next_out; + if (buf) { + struct tipc_msg *msg = 
buf_msg(buf); + u32 next = msg_seqno(msg); + u32 first = msg_seqno(buf_msg(l_ptr->first_out)); + + if (mod(next - first) < l_ptr->queue_limit[0]) { + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + if (msg_user(msg) == MSG_BUNDLER) + msg_set_type(msg, CLOSED_MSG); + msg_dbg(msg, ">PUSH-DATA>"); + l_ptr->next_out = buf->next; + return TIPC_OK; + } else { + msg_dbg(msg, "|PUSH-DATA|"); + l_ptr->stats.bearer_congs++; + return PUSH_FAILED; + } + } + } + return PUSH_FINISHED; +} + +/* + * push_queue(): push out the unsent messages of a link where + * congestion has abated. Node is locked + */ +void link_push_queue(struct link *l_ptr) +{ + u32 res; + + if (bearer_congested(l_ptr->b_ptr, l_ptr)) + return; + + do { + res = link_push_packet(l_ptr); + } + while (res == TIPC_OK); + if (res == PUSH_FAILED) + bearer_schedule(l_ptr->b_ptr, l_ptr); +} + +void link_retransmit(struct link *l_ptr, struct sk_buff *buf, + u32 retransmits) +{ + struct tipc_msg *msg; + + dbg("Retransmitting %u in link %x\n", retransmits, l_ptr); + + if (bearer_congested(l_ptr->b_ptr, l_ptr) && buf && !skb_cloned(buf)) { + msg_dbg(buf_msg(buf), ">NO_RETR->BCONG>"); + dbg_print_link(l_ptr, " "); + l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf)); + l_ptr->retransm_queue_size = retransmits; + return; + } + while (retransmits && (buf != l_ptr->next_out) && buf && !skb_cloned(buf)) { + msg = buf_msg(buf); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + /* Catch if retransmissions fail repeatedly: */ + if (l_ptr->last_retransmitted == msg_seqno(msg)) { + if (++l_ptr->stale_count > 100) { + msg_print(CONS, buf_msg(buf), ">RETR>"); + info("...Retransmitted %u times\n", + l_ptr->stale_count); + link_print(l_ptr, CONS, "Resetting Link\n");; + link_reset(l_ptr); + break; + } + } 
else { + l_ptr->stale_count = 0; + } + l_ptr->last_retransmitted = msg_seqno(msg); + + msg_dbg(buf_msg(buf), ">RETR>"); + buf = buf->next; + retransmits--; + l_ptr->stats.retransmitted++; + } else { + bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->stats.bearer_congs++; + l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf)); + l_ptr->retransm_queue_size = retransmits; + return; + } + } + l_ptr->retransm_queue_head = l_ptr->retransm_queue_size = 0; +} + +/* + * link_recv_non_seq: Receive packets which are outside + * the link sequence flow + */ + +static void link_recv_non_seq(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + + if (msg_user(msg) == LINK_CONFIG) + disc_recv_msg(buf); + else + bclink_recv_pkt(buf); +} + +/** + * link_insert_deferred_queue - insert deferred messages back into receive chain + */ + +static struct sk_buff *link_insert_deferred_queue(struct link *l_ptr, + struct sk_buff *buf) +{ + u32 seq_no; + + if (l_ptr->oldest_deferred_in == NULL) + return buf; + + seq_no = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + if (seq_no == mod(l_ptr->next_in_no)) { + l_ptr->newest_deferred_in->next = buf; + buf = l_ptr->oldest_deferred_in; + l_ptr->oldest_deferred_in = NULL; + l_ptr->deferred_inqueue_sz = 0; + } + return buf; +} + +void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) +{ + read_lock_bh(&net_lock); + while (head) { + struct bearer *b_ptr; + struct node *n_ptr; + struct link *l_ptr; + struct sk_buff *crs; + struct sk_buff *buf = head; + struct tipc_msg *msg = buf_msg(buf); + u32 seq_no = msg_seqno(msg); + u32 ackd = msg_ack(msg); + u32 released = 0; + int type; + + b_ptr = (struct bearer *)tb_ptr; + TIPC_SKB_CB(buf)->handle = b_ptr; + + head = head->next; + if (unlikely(msg_version(msg) != TIPC_VERSION)) + goto cont; +#if 0 + if (msg_user(msg) != LINK_PROTOCOL) +#endif + msg_dbg(msg,"<REC<"); + + if (unlikely(msg_non_seq(msg))) { + link_recv_non_seq(buf); + continue; + } + n_ptr = node_find(msg_prevnode(msg)); + 
if (unlikely(!n_ptr)) + goto cont; + + node_lock(n_ptr); + l_ptr = n_ptr->links[b_ptr->identity]; + if (unlikely(!l_ptr)) { + node_unlock(n_ptr); + goto cont; + } + /* + * Release acked messages + */ + if (less(n_ptr->bclink.acked, msg_bcast_ack(msg))) { + if (node_is_up(n_ptr) && n_ptr->bclink.supported) + bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); + } + + crs = l_ptr->first_out; + while ((crs != l_ptr->next_out) && + less_eq(msg_seqno(buf_msg(crs)), ackd)) { + struct sk_buff *next = crs->next; + + buf_discard(crs); + crs = next; + released++; + } + if (released) { + l_ptr->first_out = crs; + l_ptr->out_queue_size -= released; + } + if (unlikely(l_ptr->next_out)) + link_push_queue(l_ptr); + if (unlikely(!list_empty(&l_ptr->waiting_ports))) + link_wakeup_ports(l_ptr, 0); + if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + l_ptr->stats.sent_acks++; + link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + } + +protocol_check: + if (likely(link_working_working(l_ptr))) { + if (likely(seq_no == mod(l_ptr->next_in_no))) { + l_ptr->next_in_no++; + if (unlikely(l_ptr->oldest_deferred_in)) + head = link_insert_deferred_queue(l_ptr, + head); + if (likely(msg_is_dest(msg, tipc_own_addr))) { +deliver: + if (likely(msg_isdata(msg))) { + node_unlock(n_ptr); + port_recv_msg(buf); + continue; + } + switch (msg_user(msg)) { + case MSG_BUNDLER: + l_ptr->stats.recv_bundles++; + l_ptr->stats.recv_bundled += + msg_msgcnt(msg); + node_unlock(n_ptr); + link_recv_bundle(buf); + continue; + case ROUTE_DISTRIBUTOR: + node_unlock(n_ptr); + cluster_recv_routing_table(buf); + continue; + case NAME_DISTRIBUTOR: + node_unlock(n_ptr); + named_recv(buf); + continue; + case CONN_MANAGER: + node_unlock(n_ptr); + port_recv_proto_msg(buf); + continue; + case MSG_FRAGMENTER: + l_ptr->stats.recv_fragments++; + if (link_recv_fragment( + &l_ptr->defragm_buf, + &buf, &msg)) { + l_ptr->stats.recv_fragmented++; + goto deliver; + } + break; + case CHANGEOVER_PROTOCOL: + type = 
msg_type(msg); + if (link_recv_changeover_msg( + &l_ptr, &buf)) { + msg = buf_msg(buf); + seq_no = msg_seqno(msg); + TIPC_SKB_CB(buf)->handle + = b_ptr; + if (type == ORIGINAL_MSG) + goto deliver; + goto protocol_check; + } + break; + } + } + node_unlock(n_ptr); + net_route_msg(buf); + continue; + } + link_handle_out_of_seq_msg(l_ptr, buf); + head = link_insert_deferred_queue(l_ptr, head); + node_unlock(n_ptr); + continue; + } + + if (msg_user(msg) == LINK_PROTOCOL) { + link_recv_proto_msg(l_ptr, buf); + head = link_insert_deferred_queue(l_ptr, head); + node_unlock(n_ptr); + continue; + } + msg_dbg(msg,"NSEQ<REC<"); + link_state_event(l_ptr, TRAFFIC_MSG_EVT); + + if (link_working_working(l_ptr)) { + /* Re-insert in front of queue */ + msg_dbg(msg,"RECV-REINS:"); + buf->next = head; + head = buf; + node_unlock(n_ptr); + continue; + } + node_unlock(n_ptr); +cont: + buf_discard(buf); + } + read_unlock_bh(&net_lock); +} + +/* + * link_defer_buf(): Sort a received out-of-sequence packet + * into the deferred reception queue. + * Returns the increase of the queue length,i.e. 0 or 1 + */ + +u32 link_defer_pkt(struct sk_buff **head, + struct sk_buff **tail, + struct sk_buff *buf) +{ + struct sk_buff *prev = 0; + struct sk_buff *crs = *head; + u32 seq_no = msg_seqno(buf_msg(buf)); + + buf->next = NULL; + + /* Empty queue ? */ + if (*head == NULL) { + *head = *tail = buf; + return 1; + } + + /* Last ? 
*/ + if (less(msg_seqno(buf_msg(*tail)), seq_no)) { + (*tail)->next = buf; + *tail = buf; + return 1; + } + + /* Scan through queue and sort it in */ + do { + struct tipc_msg *msg = buf_msg(crs); + + if (less(seq_no, msg_seqno(msg))) { + buf->next = crs; + if (prev) + prev->next = buf; + else + *head = buf; + return 1; + } + if (seq_no == msg_seqno(msg)) { + break; + } + prev = crs; + crs = crs->next; + } + while (crs); + + /* Message is a duplicate of an existing message */ + + buf_discard(buf); + return 0; +} + +/** + * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet + */ + +static void link_handle_out_of_seq_msg(struct link *l_ptr, + struct sk_buff *buf) +{ + u32 seq_no = msg_seqno(buf_msg(buf)); + + if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { + link_recv_proto_msg(l_ptr, buf); + return; + } + + dbg("rx OOS msg: seq_no %u, expecting %u (%u)\n", + seq_no, mod(l_ptr->next_in_no), l_ptr->next_in_no); + + /* Record OOS packet arrival (force mismatch on next timeout) */ + + l_ptr->checkpoint--; + + /* + * Discard packet if a duplicate; otherwise add it to deferred queue + * and notify peer of gap as per protocol specification + */ + + if (less(seq_no, mod(l_ptr->next_in_no))) { + l_ptr->stats.duplicates++; + buf_discard(buf); + return; + } + + if (link_defer_pkt(&l_ptr->oldest_deferred_in, + &l_ptr->newest_deferred_in, buf)) { + l_ptr->deferred_inqueue_sz++; + l_ptr->stats.deferred_recv++; + if ((l_ptr->deferred_inqueue_sz % 16) == 1) + link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + } else + l_ptr->stats.duplicates++; +} + +/* + * Send protocol message to the other endpoint. 
+ */ +void link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, + u32 gap, u32 tolerance, u32 priority, u32 ack_mtu) +{ + struct sk_buff *buf = 0; + struct tipc_msg *msg = l_ptr->pmsg; + u32 msg_size = sizeof(l_ptr->proto_msg); + + if (link_blocked(l_ptr)) + return; + msg_set_type(msg, msg_typ); + msg_set_net_plane(msg, l_ptr->b_ptr->net_plane); + msg_set_bcast_ack(msg, mod(l_ptr->owner->bclink.last_in)); + msg_set_last_bcast(msg, bclink_get_last_sent()); + + if (msg_typ == STATE_MSG) { + u32 next_sent = mod(l_ptr->next_out_no); + + if (!link_is_up(l_ptr)) + return; + if (l_ptr->next_out) + next_sent = msg_seqno(buf_msg(l_ptr->next_out)); + msg_set_next_sent(msg, next_sent); + if (l_ptr->oldest_deferred_in) { + u32 rec = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + gap = mod(rec - mod(l_ptr->next_in_no)); + } + msg_set_seq_gap(msg, gap); + if (gap) + l_ptr->stats.sent_nacks++; + msg_set_link_tolerance(msg, tolerance); + msg_set_linkprio(msg, priority); + msg_set_max_pkt(msg, ack_mtu); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); + msg_set_probe(msg, probe_msg != 0); + if (probe_msg) { + u32 mtu = l_ptr->max_pkt; + + if ((mtu < l_ptr->max_pkt_target) && + link_working_working(l_ptr) && + l_ptr->fsm_msg_cnt) { + msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; + if (l_ptr->max_pkt_probes == 10) { + l_ptr->max_pkt_target = (msg_size - 4); + l_ptr->max_pkt_probes = 0; + msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3; + } + l_ptr->max_pkt_probes++; + } + + l_ptr->stats.sent_probes++; + } + l_ptr->stats.sent_states++; + } else { /* RESET_MSG or ACTIVATE_MSG */ + msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1)); + msg_set_seq_gap(msg, 0); + msg_set_next_sent(msg, 1); + msg_set_link_tolerance(msg, l_ptr->tolerance); + msg_set_linkprio(msg, l_ptr->priority); + msg_set_max_pkt(msg, l_ptr->max_pkt_target); + } + + if (node_has_redundant_links(l_ptr->owner)) { + msg_set_redundant_link(msg); + } else { + 
msg_clear_redundant_link(msg); + } + msg_set_linkprio(msg, l_ptr->priority); + + /* Ensure sequence number will not fit : */ + + msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2))); + + /* Congestion? */ + + if (bearer_congested(l_ptr->b_ptr, l_ptr)) { + if (!l_ptr->proto_msg_queue) { + l_ptr->proto_msg_queue = + buf_acquire(sizeof(l_ptr->proto_msg)); + } + buf = l_ptr->proto_msg_queue; + if (!buf) + return; + memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg)); + return; + } + msg_set_timestamp(msg, jiffies_to_msecs(jiffies)); + + /* Message can be sent */ + + msg_dbg(msg, ">>"); + + buf = buf_acquire(msg_size); + if (!buf) + return; + + memcpy(buf->data, (unchar *)msg, sizeof(l_ptr->proto_msg)); + msg_set_size(buf_msg(buf), msg_size); + + if (bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { + l_ptr->unacked_window = 0; + buf_discard(buf); + return; + } + + /* New congestion */ + bearer_schedule(l_ptr->b_ptr, l_ptr); + l_ptr->proto_msg_queue = buf; + l_ptr->stats.bearer_congs++; +} + +/* + * Receive protocol message : + * Note that network plane id propagates through the network, and may + * change at any time. 
The node with lowest address rules + */ + +static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf) +{ + u32 rec_gap = 0; + u32 max_pkt_info; + u32 max_pkt_ack; + u32 msg_tol; + struct tipc_msg *msg = buf_msg(buf); + + dbg("AT(%u):", jiffies_to_msecs(jiffies)); + msg_dbg(msg, "<<"); + if (link_blocked(l_ptr)) + goto exit; + + /* record unnumbered packet arrival (force mismatch on next timeout) */ + + l_ptr->checkpoint--; + + if (l_ptr->b_ptr->net_plane != msg_net_plane(msg)) + if (tipc_own_addr > msg_prevnode(msg)) + l_ptr->b_ptr->net_plane = msg_net_plane(msg); + + l_ptr->owner->permit_changeover = msg_redundant_link(msg); + + switch (msg_type(msg)) { + + case RESET_MSG: + if (!link_working_unknown(l_ptr) && l_ptr->peer_session) { + if (msg_session(msg) == l_ptr->peer_session) { + dbg("Duplicate RESET: %u<->%u\n", + msg_session(msg), l_ptr->peer_session); + break; /* duplicate: ignore */ + } + } + /* fall thru' */ + case ACTIVATE_MSG: + /* Update link settings according other endpoint's values */ + + strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg)); + + if ((msg_tol = msg_link_tolerance(msg)) && + (msg_tol > l_ptr->tolerance)) + link_set_supervision_props(l_ptr, msg_tol); + + if (msg_linkprio(msg) > l_ptr->priority) + l_ptr->priority = msg_linkprio(msg); + + max_pkt_info = msg_max_pkt(msg); + if (max_pkt_info) { + if (max_pkt_info < l_ptr->max_pkt_target) + l_ptr->max_pkt_target = max_pkt_info; + if (l_ptr->max_pkt > l_ptr->max_pkt_target) + l_ptr->max_pkt = l_ptr->max_pkt_target; + } else { + l_ptr->max_pkt = l_ptr->max_pkt_target; + } + l_ptr->owner->bclink.supported = (max_pkt_info != 0); + + link_state_event(l_ptr, msg_type(msg)); + + l_ptr->peer_session = msg_session(msg); + l_ptr->peer_bearer_id = msg_bearer_id(msg); + + /* Synchronize broadcast sequence numbers */ + if (!node_has_redundant_links(l_ptr->owner)) { + l_ptr->owner->bclink.last_in = mod(msg_last_bcast(msg)); + } + break; + case STATE_MSG: + + if ((msg_tol = 
msg_link_tolerance(msg))) + link_set_supervision_props(l_ptr, msg_tol); + + if (msg_linkprio(msg) && + (msg_linkprio(msg) != l_ptr->priority)) { + warn("Changing prio <%s>: %u->%u\n", + l_ptr->name, l_ptr->priority, msg_linkprio(msg)); + l_ptr->priority = msg_linkprio(msg); + link_reset(l_ptr); /* Enforce change to take effect */ + break; + } + link_state_event(l_ptr, TRAFFIC_MSG_EVT); + l_ptr->stats.recv_states++; + if (link_reset_unknown(l_ptr)) + break; + + if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) { + rec_gap = mod(msg_next_sent(msg) - + mod(l_ptr->next_in_no)); + } + + max_pkt_ack = msg_max_pkt(msg); + if (max_pkt_ack > l_ptr->max_pkt) { + dbg("Link <%s> updated MTU %u -> %u\n", + l_ptr->name, l_ptr->max_pkt, max_pkt_ack); + l_ptr->max_pkt = max_pkt_ack; + l_ptr->max_pkt_probes = 0; + } + + max_pkt_ack = 0; + if (msg_probe(msg)) { + l_ptr->stats.recv_probes++; + if (msg_size(msg) > sizeof(l_ptr->proto_msg)) { + max_pkt_ack = msg_size(msg); + } + } + + /* Protocol message before retransmits, reduce loss risk */ + + bclink_check_gap(l_ptr->owner, msg_last_bcast(msg)); + + if (rec_gap || (msg_probe(msg))) { + link_send_proto_msg(l_ptr, STATE_MSG, + 0, rec_gap, 0, 0, max_pkt_ack); + } + if (msg_seq_gap(msg)) { + msg_dbg(msg, "With Gap:"); + l_ptr->stats.recv_nacks++; + link_retransmit(l_ptr, l_ptr->first_out, + msg_seq_gap(msg)); + } + break; + default: + msg_dbg(buf_msg(buf), "<DISCARDING UNKNOWN<"); + } +exit: + buf_discard(buf); +} + + +/* + * link_tunnel(): Send one message via a link belonging to + * another bearer. Owner node is locked. 
+ */ +void link_tunnel(struct link *l_ptr, + struct tipc_msg *tunnel_hdr, + struct tipc_msg *msg, + u32 selector) +{ + struct link *tunnel; + struct sk_buff *buf; + u32 length = msg_size(msg); + + tunnel = l_ptr->owner->active_links[selector & 1]; + if (!link_is_up(tunnel)) + return; + msg_set_size(tunnel_hdr, length + INT_H_SIZE); + buf = buf_acquire(length + INT_H_SIZE); + if (!buf) + return; + memcpy(buf->data, (unchar *)tunnel_hdr, INT_H_SIZE); + memcpy(buf->data + INT_H_SIZE, (unchar *)msg, length); + dbg("%c->%c:", l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane); + msg_dbg(buf_msg(buf), ">SEND>"); + assert(tunnel); + link_send_buf(tunnel, buf); +} + + + +/* + * changeover(): Send whole message queue via the remaining link + * Owner node is locked. + */ + +void link_changeover(struct link *l_ptr) +{ + u32 msgcount = l_ptr->out_queue_size; + struct sk_buff *crs = l_ptr->first_out; + struct link *tunnel = l_ptr->owner->active_links[0]; + int split_bundles = node_has_redundant_links(l_ptr->owner); + struct tipc_msg tunnel_hdr; + + if (!tunnel) + return; + + if (!l_ptr->owner->permit_changeover) + return; + + msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, + ORIGINAL_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr); + msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); + msg_set_msgcnt(&tunnel_hdr, msgcount); + if (!l_ptr->first_out) { + struct sk_buff *buf; + + assert(!msgcount); + buf = buf_acquire(INT_H_SIZE); + if (buf) { + memcpy(buf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); + msg_set_size(&tunnel_hdr, INT_H_SIZE); + dbg("%c->%c:", l_ptr->b_ptr->net_plane, + tunnel->b_ptr->net_plane); + msg_dbg(&tunnel_hdr, "EMPTY>SEND>"); + link_send_buf(tunnel, buf); + } else { + warn("Memory squeeze; link changeover failed\n"); + } + return; + } + while (crs) { + struct tipc_msg *msg = buf_msg(crs); + + if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { + u32 msgcount = msg_msgcnt(msg); + struct tipc_msg *m = msg_get_wrapped(msg); + unchar* pos = (unchar*)m; + + while 
(msgcount--) { + msg_set_seqno(m,msg_seqno(msg)); + link_tunnel(l_ptr, &tunnel_hdr, m, + msg_link_selector(m)); + pos += align(msg_size(m)); + m = (struct tipc_msg *)pos; + } + } else { + link_tunnel(l_ptr, &tunnel_hdr, msg, + msg_link_selector(msg)); + } + crs = crs->next; + } +} + +void link_send_duplicate(struct link *l_ptr, struct link *tunnel) +{ + struct sk_buff *iter; + struct tipc_msg tunnel_hdr; + + msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, + DUPLICATE_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr); + msg_set_msgcnt(&tunnel_hdr, l_ptr->out_queue_size); + msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); + iter = l_ptr->first_out; + while (iter) { + struct sk_buff *outbuf; + struct tipc_msg *msg = buf_msg(iter); + u32 length = msg_size(msg); + + if (msg_user(msg) == MSG_BUNDLER) + msg_set_type(msg, CLOSED_MSG); + msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ + msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); + msg_set_size(&tunnel_hdr, length + INT_H_SIZE); + outbuf = buf_acquire(length + INT_H_SIZE); + if (outbuf == NULL) { + warn("Memory squeeze; buffer duplication failed\n"); + return; + } + memcpy(outbuf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); + memcpy(outbuf->data + INT_H_SIZE, iter->data, length); + dbg("%c->%c:", l_ptr->b_ptr->net_plane, + tunnel->b_ptr->net_plane); + msg_dbg(buf_msg(outbuf), ">SEND>"); + link_send_buf(tunnel, outbuf); + if (!link_is_up(l_ptr)) + return; + iter = iter->next; + } +} + + + +/** + * buf_extract - extracts embedded TIPC message from another message + * @skb: encapsulating message buffer + * @from_pos: offset to extract from + * + * Returns a new message buffer containing an embedded message. The + * encapsulating message itself is left unchanged. 
+ */ + +static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) +{ + struct tipc_msg *msg = (struct tipc_msg *)(skb->data + from_pos); + u32 size = msg_size(msg); + struct sk_buff *eb; + + eb = buf_acquire(size); + if (eb) + memcpy(eb->data, (unchar *)msg, size); + return eb; +} + +/* + * link_recv_changeover_msg(): Receive tunneled packet sent + * via other link. Node is locked. Return extracted buffer. + */ + +static int link_recv_changeover_msg(struct link **l_ptr, + struct sk_buff **buf) +{ + struct sk_buff *tunnel_buf = *buf; + struct link *dest_link; + struct tipc_msg *msg; + struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); + u32 msg_typ = msg_type(tunnel_msg); + u32 msg_count = msg_msgcnt(tunnel_msg); + + dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; + assert(dest_link != *l_ptr); + if (!dest_link) { + msg_dbg(tunnel_msg, "NOLINK/<REC<"); + goto exit; + } + dbg("%c<-%c:", dest_link->b_ptr->net_plane, + (*l_ptr)->b_ptr->net_plane); + *l_ptr = dest_link; + msg = msg_get_wrapped(tunnel_msg); + + if (msg_typ == DUPLICATE_MSG) { + if (less(msg_seqno(msg), mod(dest_link->next_in_no))) { + msg_dbg(tunnel_msg, "DROP/<REC<"); + goto exit; + } + *buf = buf_extract(tunnel_buf,INT_H_SIZE); + if (*buf == NULL) { + warn("Memory squeeze; failed to extract msg\n"); + goto exit; + } + msg_dbg(tunnel_msg, "TNL<REC<"); + buf_discard(tunnel_buf); + return 1; + } + + /* First original message ?: */ + + if (link_is_up(dest_link)) { + msg_dbg(tunnel_msg, "UP/FIRST/<REC<"); + link_reset(dest_link); + dest_link->exp_msg_count = msg_count; + if (!msg_count) + goto exit; + } else if (dest_link->exp_msg_count == START_CHANGEOVER) { + msg_dbg(tunnel_msg, "BLK/FIRST/<REC<"); + dest_link->exp_msg_count = msg_count; + if (!msg_count) + goto exit; + } + + /* Receive original message */ + + if (dest_link->exp_msg_count == 0) { + msg_dbg(tunnel_msg, "OVERDUE/DROP/<REC<"); + dbg_print_link(dest_link, "LINK:"); + goto exit; + } + dest_link->exp_msg_count--; + if 
(less(msg_seqno(msg), dest_link->reset_checkpoint)) { + msg_dbg(tunnel_msg, "DROP/DUPL/<REC<"); + goto exit; + } else { + *buf = buf_extract(tunnel_buf, INT_H_SIZE); + if (*buf != NULL) { + msg_dbg(tunnel_msg, "TNL<REC<"); + buf_discard(tunnel_buf); + return 1; + } else { + warn("Memory squeeze; dropped incoming msg\n"); + } + } +exit: + *buf = 0; + buf_discard(tunnel_buf); + return 0; +} + +/* + * Bundler functionality: + */ +void link_recv_bundle(struct sk_buff *buf) +{ + u32 msgcount = msg_msgcnt(buf_msg(buf)); + u32 pos = INT_H_SIZE; + struct sk_buff *obuf; + + msg_dbg(buf_msg(buf), "<BNDL<: "); + while (msgcount--) { + obuf = buf_extract(buf, pos); + if (obuf == NULL) { + char addr_string[16]; + + warn("Buffer allocation failure;\n"); + warn(" incoming message(s) from %s lost\n", + addr_string_fill(addr_string, + msg_orignode(buf_msg(buf)))); + return; + }; + pos += align(msg_size(buf_msg(obuf))); + msg_dbg(buf_msg(obuf), " /"); + net_route_msg(obuf); + } + buf_discard(buf); +} + +/* + * Fragmentation/defragmentation: + */ + + +/* + * link_send_long_buf: Entry for buffers needing fragmentation. + * The buffer is complete, inclusive total message length. + * Returns user data length. 
+ */ +int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) +{ + struct tipc_msg *inmsg = buf_msg(buf); + struct tipc_msg fragm_hdr; + u32 insize = msg_size(inmsg); + u32 dsz = msg_data_sz(inmsg); + unchar *crs = buf->data; + u32 rest = insize; + u32 pack_sz = link_max_pkt(l_ptr); + u32 fragm_sz = pack_sz - INT_H_SIZE; + u32 fragm_no = 1; + u32 destaddr = msg_destnode(inmsg); + + if (msg_short(inmsg)) + destaddr = l_ptr->addr; + + if (msg_routed(inmsg)) + msg_set_prevnode(inmsg, tipc_own_addr); + + /* Prepare reusable fragment header: */ + + msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + TIPC_OK, INT_H_SIZE, destaddr); + msg_set_link_selector(&fragm_hdr, msg_link_selector(inmsg)); + msg_set_long_msgno(&fragm_hdr, mod(l_ptr->long_msg_seq_no++)); + msg_set_fragm_no(&fragm_hdr, fragm_no); + l_ptr->stats.sent_fragmented++; + + /* Chop up message: */ + + while (rest > 0) { + struct sk_buff *fragm; + + if (rest <= fragm_sz) { + fragm_sz = rest; + msg_set_type(&fragm_hdr, LAST_FRAGMENT); + } + fragm = buf_acquire(fragm_sz + INT_H_SIZE); + if (fragm == NULL) { + warn("Memory squeeze; failed to fragment msg\n"); + dsz = -ENOMEM; + goto exit; + } + msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); + memcpy(fragm->data, (unchar *)&fragm_hdr, INT_H_SIZE); + memcpy(fragm->data + INT_H_SIZE, crs, fragm_sz); + + /* Send queued messages first, if any: */ + + l_ptr->stats.sent_fragments++; + link_send_buf(l_ptr, fragm); + if (!link_is_up(l_ptr)) + return dsz; + msg_set_fragm_no(&fragm_hdr, ++fragm_no); + rest -= fragm_sz; + crs += fragm_sz; + msg_set_type(&fragm_hdr, FRAGMENT); + } +exit: + buf_discard(buf); + return dsz; +} + +/* + * A pending message being re-assembled must store certain values + * to handle subsequent fragments correctly. The following functions + * help storing these values in unused, available fields in the + * pending message. This makes dynamic memory allocation unecessary. 
+ */ + +static inline u32 get_long_msg_seqno(struct sk_buff *buf) +{ + return msg_seqno(buf_msg(buf)); +} + +static inline void set_long_msg_seqno(struct sk_buff *buf, u32 seqno) +{ + msg_set_seqno(buf_msg(buf), seqno); +} + +static inline u32 get_fragm_size(struct sk_buff *buf) +{ + return msg_ack(buf_msg(buf)); +} + +static inline void set_fragm_size(struct sk_buff *buf, u32 sz) +{ + msg_set_ack(buf_msg(buf), sz); +} + +static inline u32 get_expected_frags(struct sk_buff *buf) +{ + return msg_bcast_ack(buf_msg(buf)); +} + +static inline void set_expected_frags(struct sk_buff *buf, u32 exp) +{ + msg_set_bcast_ack(buf_msg(buf), exp); +} + +static inline u32 get_timer_cnt(struct sk_buff *buf) +{ + return msg_reroute_cnt(buf_msg(buf)); +} + +static inline void incr_timer_cnt(struct sk_buff *buf) +{ + msg_incr_reroute_cnt(buf_msg(buf)); +} + +/* + * link_recv_fragment(): Called with node lock on. Returns + * the reassembled buffer if message is complete. + */ +int link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, + struct tipc_msg **m) +{ + struct sk_buff *prev = 0; + struct sk_buff *fbuf = *fb; + struct tipc_msg *fragm = buf_msg(fbuf); + struct sk_buff *pbuf = *pending; + u32 long_msg_seq_no = msg_long_msgno(fragm); + + *fb = 0; + msg_dbg(fragm,"FRG<REC<"); + + /* Is there an incomplete message waiting for this fragment? 
*/ + + while (pbuf && ((msg_seqno(buf_msg(pbuf)) != long_msg_seq_no) + || (msg_orignode(fragm) != msg_orignode(buf_msg(pbuf))))) { + prev = pbuf; + pbuf = pbuf->next; + } + + if (!pbuf && (msg_type(fragm) == FIRST_FRAGMENT)) { + struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); + u32 msg_sz = msg_size(imsg); + u32 fragm_sz = msg_data_sz(fragm); + u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); + u32 max = TIPC_MAX_USER_MSG_SIZE + LONG_H_SIZE; + if (msg_type(imsg) == TIPC_MCAST_MSG) + max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; + if (msg_size(imsg) > max) { + msg_dbg(fragm,"<REC<Oversized: "); + buf_discard(fbuf); + return 0; + } + pbuf = buf_acquire(msg_size(imsg)); + if (pbuf != NULL) { + pbuf->next = *pending; + *pending = pbuf; + memcpy(pbuf->data, (unchar *)imsg, msg_data_sz(fragm)); + + /* Prepare buffer for subsequent fragments. */ + + set_long_msg_seqno(pbuf, long_msg_seq_no); + set_fragm_size(pbuf,fragm_sz); + set_expected_frags(pbuf,exp_fragm_cnt - 1); + } else { + warn("Memory squeeze; got no defragmenting buffer\n"); + } + buf_discard(fbuf); + return 0; + } else if (pbuf && (msg_type(fragm) != FIRST_FRAGMENT)) { + u32 dsz = msg_data_sz(fragm); + u32 fsz = get_fragm_size(pbuf); + u32 crs = ((msg_fragm_no(fragm) - 1) * fsz); + u32 exp_frags = get_expected_frags(pbuf) - 1; + memcpy(pbuf->data + crs, msg_data(fragm), dsz); + buf_discard(fbuf); + + /* Is message complete? 
*/ + + if (exp_frags == 0) { + if (prev) + prev->next = pbuf->next; + else + *pending = pbuf->next; + msg_reset_reroute_cnt(buf_msg(pbuf)); + *fb = pbuf; + *m = buf_msg(pbuf); + return 1; + } + set_expected_frags(pbuf,exp_frags); + return 0; + } + dbg(" Discarding orphan fragment %x\n",fbuf); + msg_dbg(fragm,"ORPHAN:"); + dbg("Pending long buffers:\n"); + dbg_print_buf_chain(*pending); + buf_discard(fbuf); + return 0; +} + +/** + * link_check_defragm_bufs - flush stale incoming message fragments + * @l_ptr: pointer to link + */ + +static void link_check_defragm_bufs(struct link *l_ptr) +{ + struct sk_buff *prev = 0; + struct sk_buff *next = 0; + struct sk_buff *buf = l_ptr->defragm_buf; + + if (!buf) + return; + if (!link_working_working(l_ptr)) + return; + while (buf) { + u32 cnt = get_timer_cnt(buf); + + next = buf->next; + if (cnt < 4) { + incr_timer_cnt(buf); + prev = buf; + } else { + dbg(" Discarding incomplete long buffer\n"); + msg_dbg(buf_msg(buf), "LONG:"); + dbg_print_link(l_ptr, "curr:"); + dbg("Pending long buffers:\n"); + dbg_print_buf_chain(l_ptr->defragm_buf); + if (prev) + prev->next = buf->next; + else + l_ptr->defragm_buf = buf->next; + buf_discard(buf); + } + buf = next; + } +} + + + +static void link_set_supervision_props(struct link *l_ptr, u32 tolerance) +{ + l_ptr->tolerance = tolerance; + l_ptr->continuity_interval = + ((tolerance / 4) > 500) ? 
500 : tolerance / 4; + l_ptr->abort_limit = tolerance / (l_ptr->continuity_interval / 4); +} + + +void link_set_queue_limits(struct link *l_ptr, u32 window) +{ + /* Data messages from this node, inclusive FIRST_FRAGM */ + l_ptr->queue_limit[DATA_LOW] = window; + l_ptr->queue_limit[DATA_MEDIUM] = (window / 3) * 4; + l_ptr->queue_limit[DATA_HIGH] = (window / 3) * 5; + l_ptr->queue_limit[DATA_CRITICAL] = (window / 3) * 6; + /* Transiting data messages,inclusive FIRST_FRAGM */ + l_ptr->queue_limit[DATA_LOW + 4] = 300; + l_ptr->queue_limit[DATA_MEDIUM + 4] = 600; + l_ptr->queue_limit[DATA_HIGH + 4] = 900; + l_ptr->queue_limit[DATA_CRITICAL + 4] = 1200; + l_ptr->queue_limit[CONN_MANAGER] = 1200; + l_ptr->queue_limit[ROUTE_DISTRIBUTOR] = 1200; + l_ptr->queue_limit[CHANGEOVER_PROTOCOL] = 2500; + l_ptr->queue_limit[NAME_DISTRIBUTOR] = 3000; + /* FRAGMENT and LAST_FRAGMENT packets */ + l_ptr->queue_limit[MSG_FRAGMENTER] = 4000; +} + +/** + * link_find_link - locate link by name + * @name - ptr to link name string + * @node - ptr to area to be filled with ptr to associated node + * + * Caller must hold 'net_lock' to ensure node and bearer are not deleted; + * this also prevents link deletion. + * + * Returns pointer to link (or 0 if invalid link name). 
+ */ + +static struct link *link_find_link(const char *name, struct node **node) +{ + struct link_name link_name_parts; + struct bearer *b_ptr; + struct link *l_ptr; + + if (!link_name_validate(name, &link_name_parts)) + return 0; + + b_ptr = bearer_find_interface(link_name_parts.if_local); + if (!b_ptr) + return 0; + + *node = node_find(link_name_parts.addr_peer); + if (!*node) + return 0; + + l_ptr = (*node)->links[b_ptr->identity]; + if (!l_ptr || strcmp(l_ptr->name, name)) + return 0; + + return l_ptr; +} + +struct sk_buff *link_cmd_config(const void *req_tlv_area, int req_tlv_space, + u16 cmd) +{ + struct tipc_link_config *args; + u32 new_value; + struct link *l_ptr; + struct node *node; + int res; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_CONFIG)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + args = (struct tipc_link_config *)TLV_DATA(req_tlv_area); + new_value = ntohl(args->value); + + if (!strcmp(args->name, bc_link_name)) { + if ((cmd == TIPC_CMD_SET_LINK_WINDOW) && + (bclink_set_queue_limits(new_value) == 0)) + return cfg_reply_none(); + return cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change setting on broadcast link)"); + } + + read_lock_bh(&net_lock); + l_ptr = link_find_link(args->name, &node); + if (!l_ptr) { + read_unlock_bh(&net_lock); + return cfg_reply_error_string("link not found"); + } + + node_lock(node); + res = -EINVAL; + switch (cmd) { + case TIPC_CMD_SET_LINK_TOL: + if ((new_value >= TIPC_MIN_LINK_TOL) && + (new_value <= TIPC_MAX_LINK_TOL)) { + link_set_supervision_props(l_ptr, new_value); + link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, new_value, 0, 0); + res = TIPC_OK; + } + break; + case TIPC_CMD_SET_LINK_PRI: + if (new_value < TIPC_NUM_LINK_PRI) { + l_ptr->priority = new_value; + link_send_proto_msg(l_ptr, STATE_MSG, + 0, 0, 0, new_value, 0); + res = TIPC_OK; + } + break; + case TIPC_CMD_SET_LINK_WINDOW: + if ((new_value >= TIPC_MIN_LINK_WIN) && + (new_value <= TIPC_MAX_LINK_WIN)) { + 
link_set_queue_limits(l_ptr, new_value); + res = TIPC_OK; + } + break; + } + node_unlock(node); + + read_unlock_bh(&net_lock); + if (res) + return cfg_reply_error_string("cannot change link setting"); + + return cfg_reply_none(); +} + +/** + * link_reset_statistics - reset link statistics + * @l_ptr: pointer to link + */ + +static void link_reset_statistics(struct link *l_ptr) +{ + memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); + l_ptr->stats.sent_info = l_ptr->next_out_no; + l_ptr->stats.recv_info = l_ptr->next_in_no; +} + +struct sk_buff *link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space) +{ + char *link_name; + struct link *l_ptr; + struct node *node; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + link_name = (char *)TLV_DATA(req_tlv_area); + if (!strcmp(link_name, bc_link_name)) { + if (bclink_reset_stats()) + return cfg_reply_error_string("link not found"); + return cfg_reply_none(); + } + + read_lock_bh(&net_lock); + l_ptr = link_find_link(link_name, &node); + if (!l_ptr) { + read_unlock_bh(&net_lock); + return cfg_reply_error_string("link not found"); + } + + node_lock(node); + link_reset_statistics(l_ptr); + node_unlock(node); + read_unlock_bh(&net_lock); + return cfg_reply_none(); +} + +/** + * percent - convert count to a percentage of total (rounding up or down) + */ + +static u32 percent(u32 count, u32 total) +{ + return (count * 100 + (total / 2)) / total; +} + +/** + * link_stats - print link statistics + * @name: link name + * @buf: print buffer area + * @buf_size: size of print buffer area + * + * Returns length of print buffer data string (or 0 if error) + */ + +static int link_stats(const char *name, char *buf, const u32 buf_size) +{ + struct print_buf pb; + struct link *l_ptr; + struct node *node; + char *status; + u32 profile_total = 0; + + if (!strcmp(name, bc_link_name)) + return bclink_stats(buf, buf_size); + + printbuf_init(&pb, buf, buf_size); + + 
read_lock_bh(&net_lock); + l_ptr = link_find_link(name, &node); + if (!l_ptr) { + read_unlock_bh(&net_lock); + return 0; + } + node_lock(node); + + if (link_is_active(l_ptr)) + status = "ACTIVE"; + else if (link_is_up(l_ptr)) + status = "STANDBY"; + else + status = "DEFUNCT"; + tipc_printf(&pb, "Link <%s>\n" + " %s MTU:%u Priority:%u Tolerance:%u ms" + " Window:%u packets\n", + l_ptr->name, status, link_max_pkt(l_ptr), + l_ptr->priority, l_ptr->tolerance, l_ptr->queue_limit[0]); + tipc_printf(&pb, " RX packets:%u fragments:%u/%u bundles:%u/%u\n", + l_ptr->next_in_no - l_ptr->stats.recv_info, + l_ptr->stats.recv_fragments, + l_ptr->stats.recv_fragmented, + l_ptr->stats.recv_bundles, + l_ptr->stats.recv_bundled); + tipc_printf(&pb, " TX packets:%u fragments:%u/%u bundles:%u/%u\n", + l_ptr->next_out_no - l_ptr->stats.sent_info, + l_ptr->stats.sent_fragments, + l_ptr->stats.sent_fragmented, + l_ptr->stats.sent_bundles, + l_ptr->stats.sent_bundled); + profile_total = l_ptr->stats.msg_length_counts; + if (!profile_total) + profile_total = 1; + tipc_printf(&pb, " TX profile sample:%u packets average:%u octets\n" + " 0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% " + "-16354:%u%% -32768:%u%% -66000:%u%%\n", + l_ptr->stats.msg_length_counts, + l_ptr->stats.msg_lengths_total / profile_total, + percent(l_ptr->stats.msg_length_profile[0], profile_total), + percent(l_ptr->stats.msg_length_profile[1], profile_total), + percent(l_ptr->stats.msg_length_profile[2], profile_total), + percent(l_ptr->stats.msg_length_profile[3], profile_total), + percent(l_ptr->stats.msg_length_profile[4], profile_total), + percent(l_ptr->stats.msg_length_profile[5], profile_total), + percent(l_ptr->stats.msg_length_profile[6], profile_total)); + tipc_printf(&pb, " RX states:%u probes:%u naks:%u defs:%u dups:%u\n", + l_ptr->stats.recv_states, + l_ptr->stats.recv_probes, + l_ptr->stats.recv_nacks, + l_ptr->stats.deferred_recv, + l_ptr->stats.duplicates); + tipc_printf(&pb, " TX states:%u probes:%u naks:%u 
acks:%u dups:%u\n", + l_ptr->stats.sent_states, + l_ptr->stats.sent_probes, + l_ptr->stats.sent_nacks, + l_ptr->stats.sent_acks, + l_ptr->stats.retransmitted); + tipc_printf(&pb, " Congestion bearer:%u link:%u Send queue max:%u avg:%u\n", + l_ptr->stats.bearer_congs, + l_ptr->stats.link_congs, + l_ptr->stats.max_queue_sz, + l_ptr->stats.queue_sz_counts + ? (l_ptr->stats.accu_queue_sz / l_ptr->stats.queue_sz_counts) + : 0); + + node_unlock(node); + read_unlock_bh(&net_lock); + return printbuf_validate(&pb); +} + +#define MAX_LINK_STATS_INFO 2000 + +struct sk_buff *link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + int str_len; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + buf = cfg_reply_alloc(TLV_SPACE(MAX_LINK_STATS_INFO)); + if (!buf) + return NULL; + + rep_tlv = (struct tlv_desc *)buf->data; + + str_len = link_stats((char *)TLV_DATA(req_tlv_area), + (char *)TLV_DATA(rep_tlv), MAX_LINK_STATS_INFO); + if (!str_len) { + buf_discard(buf); + return cfg_reply_error_string("link not found"); + } + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +#if 0 +int link_control(const char *name, u32 op, u32 val) +{ + int res = -EINVAL; + struct link *l_ptr; + u32 bearer_id; + struct node * node; + u32 a; + + a = link_name2addr(name, &bearer_id); + read_lock_bh(&net_lock); + node = node_find(a); + if (node) { + node_lock(node); + l_ptr = node->links[bearer_id]; + if (l_ptr) { + if (op == TIPC_REMOVE_LINK) { + struct bearer *b_ptr = l_ptr->b_ptr; + spin_lock_bh(&b_ptr->publ.lock); + link_delete(l_ptr); + spin_unlock_bh(&b_ptr->publ.lock); + } + if (op == TIPC_CMD_BLOCK_LINK) { + link_reset(l_ptr); + l_ptr->blocked = 1; + } + if (op == TIPC_CMD_UNBLOCK_LINK) { + l_ptr->blocked = 0; + } + res = TIPC_OK; + } + node_unlock(node); + } + read_unlock_bh(&net_lock); + return 
res; +} +#endif + +/** + * link_get_max_pkt - get maximum packet size to use when sending to destination + * @dest: network address of destination node + * @selector: used to select from set of active links + * + * If no active link can be found, uses default maximum packet size. + */ + +u32 link_get_max_pkt(u32 dest, u32 selector) +{ + struct node *n_ptr; + struct link *l_ptr; + u32 res = MAX_PKT_DEFAULT; + + if (dest == tipc_own_addr) + return MAX_MSG_SIZE; + + read_lock_bh(&net_lock); + n_ptr = node_select(dest, selector); + if (n_ptr) { + node_lock(n_ptr); + l_ptr = n_ptr->active_links[selector & 1]; + if (l_ptr) + res = link_max_pkt(l_ptr); + node_unlock(n_ptr); + } + read_unlock_bh(&net_lock); + return res; +} + +#if 0 +static void link_dump_rec_queue(struct link *l_ptr) +{ + struct sk_buff *crs; + + if (!l_ptr->oldest_deferred_in) { + info("Reception queue empty\n"); + return; + } + info("Contents of Reception queue:\n"); + crs = l_ptr->oldest_deferred_in; + while (crs) { + if (crs->data == (void *)0x0000a3a3) { + info("buffer %x invalid\n", crs); + return; + } + msg_dbg(buf_msg(crs), "In rec queue: \n"); + crs = crs->next; + } +} +#endif + +static void link_dump_send_queue(struct link *l_ptr) +{ + if (l_ptr->next_out) { + info("\nContents of unsent queue:\n"); + dbg_print_buf_chain(l_ptr->next_out); + } + info("\nContents of send queue:\n"); + if (l_ptr->first_out) { + dbg_print_buf_chain(l_ptr->first_out); + } + info("Empty send queue\n"); +} + +static void link_print(struct link *l_ptr, struct print_buf *buf, + const char *str) +{ + tipc_printf(buf, str); + if (link_reset_reset(l_ptr) || link_reset_unknown(l_ptr)) + return; + tipc_printf(buf, "Link %x<%s>:", + l_ptr->addr, l_ptr->b_ptr->publ.name); + tipc_printf(buf, ": NXO(%u):", mod(l_ptr->next_out_no)); + tipc_printf(buf, "NXI(%u):", mod(l_ptr->next_in_no)); + tipc_printf(buf, "SQUE"); + if (l_ptr->first_out) { + tipc_printf(buf, "[%u..", msg_seqno(buf_msg(l_ptr->first_out))); + if (l_ptr->next_out) + 
tipc_printf(buf, "%u..", + msg_seqno(buf_msg(l_ptr->next_out))); + tipc_printf(buf, "%u]", + msg_seqno(buf_msg + (l_ptr->last_out)), l_ptr->out_queue_size); + if ((mod(msg_seqno(buf_msg(l_ptr->last_out)) - + msg_seqno(buf_msg(l_ptr->first_out))) + != (l_ptr->out_queue_size - 1)) + || (l_ptr->last_out->next != 0)) { + tipc_printf(buf, "\nSend queue inconsistency\n"); + tipc_printf(buf, "first_out= %x ", l_ptr->first_out); + tipc_printf(buf, "next_out= %x ", l_ptr->next_out); + tipc_printf(buf, "last_out= %x ", l_ptr->last_out); + link_dump_send_queue(l_ptr); + } + } else + tipc_printf(buf, "[]"); + tipc_printf(buf, "SQSIZ(%u)", l_ptr->out_queue_size); + if (l_ptr->oldest_deferred_in) { + u32 o = msg_seqno(buf_msg(l_ptr->oldest_deferred_in)); + u32 n = msg_seqno(buf_msg(l_ptr->newest_deferred_in)); + tipc_printf(buf, ":RQUE[%u..%u]", o, n); + if (l_ptr->deferred_inqueue_sz != mod((n + 1) - o)) { + tipc_printf(buf, ":RQSIZ(%u)", + l_ptr->deferred_inqueue_sz); + } + } + if (link_working_unknown(l_ptr)) + tipc_printf(buf, ":WU"); + if (link_reset_reset(l_ptr)) + tipc_printf(buf, ":RR"); + if (link_reset_unknown(l_ptr)) + tipc_printf(buf, ":RU"); + if (link_working_working(l_ptr)) + tipc_printf(buf, ":WW"); + tipc_printf(buf, "\n"); +} + diff --git a/net/tipc/link.h b/net/tipc/link.h new file mode 100644 index 000000000000..c2553f073757 --- /dev/null +++ b/net/tipc/link.h @@ -0,0 +1,296 @@ +/* + * net/tipc/link.h: Include file for TIPC link code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_LINK_H +#define _TIPC_LINK_H + +#include "dbg.h" +#include "msg.h" +#include "bearer.h" +#include "node.h" + +#define PUSH_FAILED 1 +#define PUSH_FINISHED 2 + +/* + * Link states + */ + +#define WORKING_WORKING 560810u +#define WORKING_UNKNOWN 560811u +#define RESET_UNKNOWN 560812u +#define RESET_RESET 560813u + +/* + * Starting value for maximum packet size negotiation on unicast links + * (unless bearer MTU is less) + */ + +#define MAX_PKT_DEFAULT 1500 + +/** + * struct link - TIPC link data structure + * @addr: network address of link's peer node + * @name: link name character string + * @media_addr: media address to use when sending messages over link + * @timer: link timer + * @owner: pointer to peer node + * @link_list: adjacent links in bearer's list of links + * @started: indicates if link has been started + * @checkpoint: reference point for triggering link continuity checking + * @peer_session: link session # being used by peer end of link + * @peer_bearer_id: bearer id used by link's peer endpoint + * @b_ptr: pointer to bearer used by link + * @tolerance: minimum link continuity loss needed to reset link [in ms] + * @continuity_interval: link continuity testing interval [in ms] + * @abort_limit: # of unacknowledged continuity probes needed to reset link + * @state: current state of link FSM + * @blocked: indicates if link has been administratively blocked + * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state + * @proto_msg: template for control messages generated by link + * @pmsg: convenience pointer to "proto_msg" field + * @priority: current link priority + * @queue_limit: outbound message queue congestion thresholds (indexed by user) + * @exp_msg_count: # of tunnelled messages expected during link changeover + * @reset_checkpoint: seq # of last acknowledged message at time of link reset + * @max_pkt: current maximum packet size for this link + * @max_pkt_target: desired maximum packet size for this link + * 
@max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target) + * @out_queue_size: # of messages in outbound message queue + * @first_out: ptr to first outbound message in queue + * @last_out: ptr to last outbound message in queue + * @next_out_no: next sequence number to use for outbound messages + * @last_retransmitted: sequence number of most recently retransmitted message + * @stale_count: # of identical retransmit requests made by peer + * @next_in_no: next sequence number to expect for inbound messages + * @deferred_inqueue_sz: # of messages in inbound message queue + * @oldest_deferred_in: ptr to first inbound message in queue + * @newest_deferred_in: ptr to last inbound message in queue + * @unacked_window: # of inbound messages rx'd without ack'ing back to peer + * @proto_msg_queue: ptr to (single) outbound control message + * @retransm_queue_size: number of messages to retransmit + * @retransm_queue_head: sequence number of first message to retransmit + * @next_out: ptr to first unsent outbound message in queue + * @waiting_ports: linked list of ports waiting for link congestion to abate + * @long_msg_seq_no: next identifier to use for outbound fragmented messages + * @defragm_buf: list of partially reassembled inbound message fragments + * @stats: collects statistics regarding link activity + * @print_buf: print buffer used to log link activity + */ + +struct link { + u32 addr; + char name[TIPC_MAX_LINK_NAME]; + struct tipc_media_addr media_addr; + struct timer_list timer; + struct node *owner; + struct list_head link_list; + + /* Management and link supervision data */ + int started; + u32 checkpoint; + u32 peer_session; + u32 peer_bearer_id; + struct bearer *b_ptr; + u32 tolerance; + u32 continuity_interval; + u32 abort_limit; + int state; + int blocked; + u32 fsm_msg_cnt; + struct { + unchar hdr[INT_H_SIZE]; + unchar body[TIPC_MAX_IF_NAME]; + } proto_msg; + struct tipc_msg *pmsg; + u32 priority; + u32 queue_limit[15]; /* 
queue_limit[0]==window limit */ + + /* Changeover */ + u32 exp_msg_count; + u32 reset_checkpoint; + + /* Max packet negotiation */ + u32 max_pkt; + u32 max_pkt_target; + u32 max_pkt_probes; + + /* Sending */ + u32 out_queue_size; + struct sk_buff *first_out; + struct sk_buff *last_out; + u32 next_out_no; + u32 last_retransmitted; + u32 stale_count; + + /* Reception */ + u32 next_in_no; + u32 deferred_inqueue_sz; + struct sk_buff *oldest_deferred_in; + struct sk_buff *newest_deferred_in; + u32 unacked_window; + + /* Congestion handling */ + struct sk_buff *proto_msg_queue; + u32 retransm_queue_size; + u32 retransm_queue_head; + struct sk_buff *next_out; + struct list_head waiting_ports; + + /* Fragmentation/defragmentation */ + u32 long_msg_seq_no; + struct sk_buff *defragm_buf; + + /* Statistics */ + struct { + u32 sent_info; /* used in counting # sent packets */ + u32 recv_info; /* used in counting # recv'd packets */ + u32 sent_states; + u32 recv_states; + u32 sent_probes; + u32 recv_probes; + u32 sent_nacks; + u32 recv_nacks; + u32 sent_acks; + u32 sent_bundled; + u32 sent_bundles; + u32 recv_bundled; + u32 recv_bundles; + u32 retransmitted; + u32 sent_fragmented; + u32 sent_fragments; + u32 recv_fragmented; + u32 recv_fragments; + u32 link_congs; /* # port sends blocked by congestion */ + u32 bearer_congs; + u32 deferred_recv; + u32 duplicates; + + /* for statistical profiling of send queue size */ + + u32 max_queue_sz; + u32 accu_queue_sz; + u32 queue_sz_counts; + + /* for statistical profiling of message lengths */ + + u32 msg_length_counts; + u32 msg_lengths_total; + u32 msg_length_profile[7]; +#if 0 + u32 sent_tunneled; + u32 recv_tunneled; +#endif + } stats; + + struct print_buf print_buf; +}; + +struct port; + +struct link *link_create(struct bearer *b_ptr, const u32 peer, + const struct tipc_media_addr *media_addr); +void link_delete(struct link *l_ptr); +void link_changeover(struct link *l_ptr); +void link_send_duplicate(struct link *l_ptr, struct link 
/*
 * Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
 */

/* Reduce a value to the 16-bit sequence number space */
static inline u32 mod(u32 x)
{
	return x & 0xffffu;
}

/*
 * Returns 1 if n lies strictly between lower and upper, also handling the
 * case where the interval wraps around (upper < lower); otherwise 0.
 */
static inline int between(u32 lower, u32 upper, u32 n)
{
	int inside = (lower < n) && (n < upper);
	int wrapped = (upper < lower) && ((n > lower) || (n < upper));

	return (inside || wrapped) ? 1 : 0;
}

/* Returns non-zero if right is at most half the sequence space ahead of left */
static inline int less_eq(u32 left, u32 right)
{
	u32 distance = mod(right - left);

	return distance < 32768u;
}

/* Like less_eq(), but false when both map to the same sequence number */
static inline int less(u32 left, u32 right)
{
	if (!less_eq(left, right))
		return 0;
	return mod(right) != mod(left);
}

/* Returns whichever argument comes first in sequence order (left on a tie) */
static inline u32 lesser(u32 left, u32 right)
{
	if (less_eq(left, right))
		return left;
	return right;
}
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "dbg.h" +#include "msg.h" +#include "bearer.h" + + +void msg_set_media_addr(struct tipc_msg *m, struct tipc_media_addr *a) +{ + memcpy(&((int *)m)[5], a, sizeof(*a)); +} + +void msg_get_media_addr(struct tipc_msg *m, struct tipc_media_addr *a) +{ + memcpy(a, &((int*)m)[5], sizeof(*a)); +} + + +void msg_print(struct print_buf *buf, struct tipc_msg *msg, const char *str) +{ + u32 usr = msg_user(msg); + tipc_printf(buf, str); + + switch (usr) { + case MSG_BUNDLER: + tipc_printf(buf, "BNDL::"); + tipc_printf(buf, "MSGS(%u):", msg_msgcnt(msg)); + break; + case BCAST_PROTOCOL: + tipc_printf(buf, "BCASTP::"); + break; + case MSG_FRAGMENTER: + tipc_printf(buf, "FRAGM::"); + switch (msg_type(msg)) { + case FIRST_FRAGMENT: + tipc_printf(buf, "FIRST:"); + break; + case FRAGMENT: + tipc_printf(buf, "BODY:"); + break; + case LAST_FRAGMENT: + tipc_printf(buf, "LAST:"); + break; + default: + tipc_printf(buf, "UNKNOWN:%x",msg_type(msg)); + + } + tipc_printf(buf, "NO(%u/%u):",msg_long_msgno(msg), + msg_fragm_no(msg)); + break; + case DATA_LOW: + case DATA_MEDIUM: + case DATA_HIGH: + case DATA_CRITICAL: + tipc_printf(buf, "DAT%u:", msg_user(msg)); + if (msg_short(msg)) { + tipc_printf(buf, "CON:"); + break; + } + switch (msg_type(msg)) { + case TIPC_CONN_MSG: + tipc_printf(buf, "CON:"); + break; + case TIPC_MCAST_MSG: + tipc_printf(buf, "MCST:"); + break; + case TIPC_NAMED_MSG: + tipc_printf(buf, 
"NAM:"); + break; + case TIPC_DIRECT_MSG: + tipc_printf(buf, "DIR:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE %u",msg_type(msg)); + } + if (msg_routed(msg) && !msg_non_seq(msg)) + tipc_printf(buf, "ROUT:"); + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):", + msg_reroute_cnt(msg)); + break; + case NAME_DISTRIBUTOR: + tipc_printf(buf, "NMD::"); + switch (msg_type(msg)) { + case PUBLICATION: + tipc_printf(buf, "PUBL(%u):", (msg_size(msg) - msg_hdr_sz(msg)) / 20); /* Items */ + break; + case WITHDRAWAL: + tipc_printf(buf, "WDRW:"); + break; + default: + tipc_printf(buf, "UNKNOWN:%x",msg_type(msg)); + } + if (msg_routed(msg)) + tipc_printf(buf, "ROUT:"); + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):", + msg_reroute_cnt(msg)); + break; + case CONN_MANAGER: + tipc_printf(buf, "CONN_MNG:"); + switch (msg_type(msg)) { + case CONN_PROBE: + tipc_printf(buf, "PROBE:"); + break; + case CONN_PROBE_REPLY: + tipc_printf(buf, "PROBE_REPLY:"); + break; + case CONN_ACK: + tipc_printf(buf, "CONN_ACK:"); + tipc_printf(buf, "ACK(%u):",msg_msgcnt(msg)); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg)); + } + if (msg_routed(msg)) + tipc_printf(buf, "ROUT:"); + if (msg_reroute_cnt(msg)) + tipc_printf(buf, "REROUTED(%u):",msg_reroute_cnt(msg)); + break; + case LINK_PROTOCOL: + tipc_printf(buf, "PROT:TIM(%u):",msg_timestamp(msg)); + switch (msg_type(msg)) { + case STATE_MSG: + tipc_printf(buf, "STATE:"); + tipc_printf(buf, "%s:",msg_probe(msg) ? 
"PRB" :""); + tipc_printf(buf, "NXS(%u):",msg_next_sent(msg)); + tipc_printf(buf, "GAP(%u):",msg_seq_gap(msg)); + tipc_printf(buf, "LSTBC(%u):",msg_last_bcast(msg)); + break; + case RESET_MSG: + tipc_printf(buf, "RESET:"); + if (msg_size(msg) != msg_hdr_sz(msg)) + tipc_printf(buf, "BEAR:%s:",msg_data(msg)); + break; + case ACTIVATE_MSG: + tipc_printf(buf, "ACTIVATE:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg)); + } + tipc_printf(buf, "PLANE(%c):",msg_net_plane(msg)); + tipc_printf(buf, "SESS(%u):",msg_session(msg)); + break; + case CHANGEOVER_PROTOCOL: + tipc_printf(buf, "TUNL:"); + switch (msg_type(msg)) { + case DUPLICATE_MSG: + tipc_printf(buf, "DUPL:"); + break; + case ORIGINAL_MSG: + tipc_printf(buf, "ORIG:"); + tipc_printf(buf, "EXP(%u)",msg_msgcnt(msg)); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg)); + } + break; + case ROUTE_DISTRIBUTOR: + tipc_printf(buf, "ROUTING_MNG:"); + switch (msg_type(msg)) { + case EXT_ROUTING_TABLE: + tipc_printf(buf, "EXT_TBL:"); + tipc_printf(buf, "TO:%x:",msg_remote_node(msg)); + break; + case LOCAL_ROUTING_TABLE: + tipc_printf(buf, "LOCAL_TBL:"); + tipc_printf(buf, "TO:%x:",msg_remote_node(msg)); + break; + case SLAVE_ROUTING_TABLE: + tipc_printf(buf, "DP_TBL:"); + tipc_printf(buf, "TO:%x:",msg_remote_node(msg)); + break; + case ROUTE_ADDITION: + tipc_printf(buf, "ADD:"); + tipc_printf(buf, "TO:%x:",msg_remote_node(msg)); + break; + case ROUTE_REMOVAL: + tipc_printf(buf, "REMOVE:"); + tipc_printf(buf, "TO:%x:",msg_remote_node(msg)); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x",msg_type(msg)); + } + break; + case LINK_CONFIG: + tipc_printf(buf, "CFG:"); + switch (msg_type(msg)) { + case DSC_REQ_MSG: + tipc_printf(buf, "DSC_REQ:"); + break; + case DSC_RESP_MSG: + tipc_printf(buf, "DSC_RESP:"); + break; + default: + tipc_printf(buf, "UNKNOWN TYPE:%x:",msg_type(msg)); + break; + } + break; + default: + tipc_printf(buf, "UNKNOWN USER:"); + } + + switch (usr) { + 
case CONN_MANAGER: + case NAME_DISTRIBUTOR: + case DATA_LOW: + case DATA_MEDIUM: + case DATA_HIGH: + case DATA_CRITICAL: + if (msg_short(msg)) + break; /* No error */ + switch (msg_errcode(msg)) { + case TIPC_OK: + break; + case TIPC_ERR_NO_NAME: + tipc_printf(buf, "NO_NAME:"); + break; + case TIPC_ERR_NO_PORT: + tipc_printf(buf, "NO_PORT:"); + break; + case TIPC_ERR_NO_NODE: + tipc_printf(buf, "NO_PROC:"); + break; + case TIPC_ERR_OVERLOAD: + tipc_printf(buf, "OVERLOAD:"); + break; + case TIPC_CONN_SHUTDOWN: + tipc_printf(buf, "SHUTDOWN:"); + break; + default: + tipc_printf(buf, "UNKNOWN ERROR(%x):", + msg_errcode(msg)); + } + default:{} + } + + tipc_printf(buf, "HZ(%u):", msg_hdr_sz(msg)); + tipc_printf(buf, "SZ(%u):", msg_size(msg)); + tipc_printf(buf, "SQNO(%u):", msg_seqno(msg)); + + if (msg_non_seq(msg)) + tipc_printf(buf, "NOSEQ:"); + else { + tipc_printf(buf, "ACK(%u):", msg_ack(msg)); + } + tipc_printf(buf, "BACK(%u):", msg_bcast_ack(msg)); + tipc_printf(buf, "PRND(%x)", msg_prevnode(msg)); + + if (msg_isdata(msg)) { + if (msg_named(msg)) { + tipc_printf(buf, "NTYP(%u):", msg_nametype(msg)); + tipc_printf(buf, "NINST(%u)", msg_nameinst(msg)); + } + } + + if ((usr != LINK_PROTOCOL) && (usr != LINK_CONFIG) && + (usr != MSG_BUNDLER)) { + if (!msg_short(msg)) { + tipc_printf(buf, ":ORIG(%x:%u):", + msg_orignode(msg), msg_origport(msg)); + tipc_printf(buf, ":DEST(%x:%u):", + msg_destnode(msg), msg_destport(msg)); + } else { + tipc_printf(buf, ":OPRT(%u):", msg_origport(msg)); + tipc_printf(buf, ":DPRT(%u):", msg_destport(msg)); + } + if (msg_routed(msg) && !msg_non_seq(msg)) + tipc_printf(buf, ":TSEQN(%u)", msg_transp_seqno(msg)); + } + if (msg_user(msg) == NAME_DISTRIBUTOR) { + tipc_printf(buf, ":ONOD(%x):", msg_orignode(msg)); + tipc_printf(buf, ":DNOD(%x):", msg_destnode(msg)); + if (msg_routed(msg)) { + tipc_printf(buf, ":CSEQN(%u)", msg_transp_seqno(msg)); + } + } + + if (msg_user(msg) == LINK_CONFIG) { + u32* raw = (u32*)msg; + struct tipc_media_addr* 
orig = (struct tipc_media_addr*)&raw[5]; + tipc_printf(buf, ":REQL(%u):", msg_req_links(msg)); + tipc_printf(buf, ":DDOM(%x):", msg_dest_domain(msg)); + tipc_printf(buf, ":NETID(%u):", msg_bc_netid(msg)); + media_addr_printf(buf, orig); + } + if (msg_user(msg) == BCAST_PROTOCOL) { + tipc_printf(buf, "BCNACK:AFTER(%u):", msg_bcgap_after(msg)); + tipc_printf(buf, "TO(%u):", msg_bcgap_to(msg)); + } + tipc_printf(buf, "\n"); + if ((usr == CHANGEOVER_PROTOCOL) && (msg_msgcnt(msg))) { + msg_print(buf,msg_get_wrapped(msg)," /"); + } + if ((usr == MSG_FRAGMENTER) && (msg_type(msg) == FIRST_FRAGMENT)) { + msg_print(buf,msg_get_wrapped(msg)," /"); + } +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h new file mode 100644 index 000000000000..662c81862a0c --- /dev/null +++ b/net/tipc/msg.h @@ -0,0 +1,818 @@ +/* + * net/tipc/msg.h: Include file for TIPC message header routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_MSG_H +#define _TIPC_MSG_H + +#include <net/tipc/tipc_msg.h> + +#define TIPC_VERSION 2 +#define DATA_LOW TIPC_LOW_IMPORTANCE +#define DATA_MEDIUM TIPC_MEDIUM_IMPORTANCE +#define DATA_HIGH TIPC_HIGH_IMPORTANCE +#define DATA_CRITICAL TIPC_CRITICAL_IMPORTANCE +#define SHORT_H_SIZE 24 /* Connected,in cluster */ +#define DIR_MSG_H_SIZE 32 /* Directly addressed messages */ +#define CONN_MSG_H_SIZE 36 /* Routed connected msgs*/ +#define LONG_H_SIZE 40 /* Named Messages */ +#define MCAST_H_SIZE 44 /* Multicast messages */ +#define MAX_H_SIZE 60 /* Inclusive full options */ +#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) +#define LINK_CONFIG 13 + + +/* + TIPC user data message header format, version 2 + + - Fundamental definitions available to privileged TIPC users + are located in tipc_msg.h. + - Remaining definitions available to TIPC internal users appear below. 
+*/ + + +static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) +{ + m->hdr[w] = htonl(val); +} + +static inline void msg_set_bits(struct tipc_msg *m, u32 w, + u32 pos, u32 mask, u32 val) +{ + u32 word = msg_word(m,w) & ~(mask << pos); + msg_set_word(m, w, (word |= (val << pos))); +} + +/* + * Word 0 + */ + +static inline u32 msg_version(struct tipc_msg *m) +{ + return msg_bits(m, 0, 29, 7); +} + +static inline void msg_set_version(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 29, 0xf, TIPC_VERSION); +} + +static inline u32 msg_user(struct tipc_msg *m) +{ + return msg_bits(m, 0, 25, 0xf); +} + +static inline u32 msg_isdata(struct tipc_msg *m) +{ + return (msg_user(m) <= DATA_CRITICAL); +} + +static inline void msg_set_user(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 0, 25, 0xf, n); +} + +static inline void msg_set_importance(struct tipc_msg *m, u32 i) +{ + msg_set_user(m, i); +} + +static inline void msg_set_hdr_sz(struct tipc_msg *m,u32 n) +{ + msg_set_bits(m, 0, 21, 0xf, n>>2); +} + +static inline int msg_non_seq(struct tipc_msg *m) +{ + return msg_bits(m, 0, 20, 1); +} + +static inline void msg_set_non_seq(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 20, 1, 1); +} + +static inline int msg_dest_droppable(struct tipc_msg *m) +{ + return msg_bits(m, 0, 19, 1); +} + +static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 19, 1, d); +} + +static inline int msg_src_droppable(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) +{ + msg_set_bits(m, 0, 18, 1, d); +} + +static inline void msg_set_size(struct tipc_msg *m, u32 sz) +{ + m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); +} + + +/* + * Word 1 + */ + +static inline void msg_set_type(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 29, 0x7, n); +} + +static inline void msg_set_errcode(struct tipc_msg *m, u32 err) +{ + msg_set_bits(m, 1, 25, 0xf, err); +} + +static inline u32 
msg_reroute_cnt(struct tipc_msg *m) +{ + return msg_bits(m, 1, 21, 0xf); +} + +static inline void msg_incr_reroute_cnt(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); +} + +static inline void msg_reset_reroute_cnt(struct tipc_msg *m) +{ + msg_set_bits(m, 1, 21, 0xf, 0); +} + +static inline u32 msg_lookup_scope(struct tipc_msg *m) +{ + return msg_bits(m, 1, 19, 0x3); +} + +static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 19, 0x3, n); +} + +static inline void msg_set_options(struct tipc_msg *m, const char *opt, u32 sz) +{ + u32 hsz = msg_hdr_sz(m); + char *to = (char *)&m->hdr[hsz/4]; + + if ((hsz < DIR_MSG_H_SIZE) || ((hsz + sz) > MAX_H_SIZE)) + return; + msg_set_bits(m, 1, 16, 0x7, (hsz - 28)/4); + msg_set_hdr_sz(m, hsz + sz); + memcpy(to, opt, sz); +} + +static inline u32 msg_bcast_ack(struct tipc_msg *m) +{ + return msg_bits(m, 1, 0, 0xffff); +} + +static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 1, 0, 0xffff, n); +} + + +/* + * Word 2 + */ + +static inline u32 msg_ack(struct tipc_msg *m) +{ + return msg_bits(m, 2, 16, 0xffff); +} + +static inline void msg_set_ack(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 16, 0xffff, n); +} + +static inline u32 msg_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 2, 0, 0xffff); +} + +static inline void msg_set_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 2, 0, 0xffff, n); +} + + +/* + * Words 3-10 + */ + + +static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) +{ + msg_set_word(m, 3, a); +} + +static inline void msg_set_origport(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 4, p); +} + +static inline void msg_set_destport(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 5, p); +} + +static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) +{ + msg_set_word(m, 5, p); +} + +static inline void msg_set_orignode(struct tipc_msg *m, u32 a) +{ + msg_set_word(m, 6, a); +} + +static inline void 
/**
 * msg_expand - convert a short-header message to the long header format
 * @m: message to convert
 * @destnode: value stored as the message's destination node
 *
 * Does nothing unless msg_short() reports a short header. Otherwise the
 * header size is set to LONG_H_SIZE, the originating node is taken from the
 * previous-node field, and 12 bytes starting at header word 8 are cleared.
 */
static inline void msg_expand(struct tipc_msg *m, u32 destnode)
{
	if (msg_short(m)) {
		msg_set_hdr_sz(m, LONG_H_SIZE);
		msg_set_orignode(m, msg_prevnode(m));
		msg_set_destnode(m, destnode);
		memset(&m->hdr[8], 0, 12);
	}
}
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w3:|                         previous node                         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w4:| next sent broadcast/fragm no  |  next sent pkt/ fragm msg no  |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w5:|          session no           |rsv=0|r|berid|link prio|netpl|p|
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w6:|                       originating node                        |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w7:|                       destination node                        |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w8:|                   transport sequence number                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   w9:|     msg count / bcast tag     |        link tolerance         |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      \                                                               \
+      /                      User Specific Data                       /
+      \                                                               \
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+   NB: CONN_MANAGER use data message format. LINK_CONFIG has own format.
+*/
+
+/*
+ * Internal users
+ *
+ * Values for the message "user" field (see msg_init(), which stores its
+ * 'user' argument with msg_set_user(), and msg_tot_importance(), which
+ * compares msg_user() against MSG_FRAGMENTER).
+ *
+ * INT_H_SIZE is 40 bytes, i.e. the ten 32-bit words w0-w9 drawn in the
+ * internal header diagram above.
+ */
+
+#define BCAST_PROTOCOL 5
+#define MSG_BUNDLER 6
+#define LINK_PROTOCOL 7
+#define CONN_MANAGER 8
+#define ROUTE_DISTRIBUTOR 9
+#define CHANGEOVER_PROTOCOL 10
+#define NAME_DISTRIBUTOR 11
+#define MSG_FRAGMENTER 12
+#define LINK_CONFIG 13
+#define INT_H_SIZE 40
+#define DSC_H_SIZE 40
+
+/*
+ * Connection management protocol messages
+ */
+
+#define CONN_PROBE 0
+#define CONN_PROBE_REPLY 1
+#define CONN_ACK 2
+
+/*
+ * Name distributor messages
+ */
+
+#define PUBLICATION 0
+#define WITHDRAWAL 1
+
+
+/*
+ * Word 1
+ *
+ * Accessor convention used below (presumably msg_bits(m, word, bit offset,
+ * mask) -- the helpers are defined earlier in this header; confirm there).
+ *
+ * msg_seq_gap() and msg_req_links() both start at bit offset 16 of word 1
+ * (masks 0xff and 0xfff respectively), so the two fields overlap and are
+ * alternative interpretations of the same bits.
+ */
+
+static inline u32 msg_seq_gap(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 16, 0xff);
+}
+
+static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 16, 0xff, n);
+}
+
+static inline u32 msg_req_links(struct tipc_msg *m)
+{
+	return msg_bits(m, 1, 16, 0xfff);
+}
+
+static inline void msg_set_req_links(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 16, 0xfff, n);
+}
+
+
+/*
+ * Word 2
+ *
+ * The destination domain occupies the whole word; the bcgap_after (offset 16)
+ * and bcgap_to (offset 0) pair are 16-bit fields inside that same word.
+ */
+
+static inline u32 msg_dest_domain(struct tipc_msg *m)
+{
+	return msg_word(m, 2);
+}
+
+static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
+{
+	msg_set_word(m, 2, n);
+}
+
+static inline u32 msg_bcgap_after(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 16, 0xffff);
+}
+
+static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 2, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcgap_to(struct tipc_msg *m)
+{
+	return msg_bits(m, 2, 0, 0xffff);
+}
+
+static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 2, 0, 0xffff, n);
+}
+
+
+/*
+ * Word 4
+ *
+ * Several accessors alias the same bits of this word:
+ *  - last_bcast and fragm_no both use the 16-bit field at offset 16;
+ *  - next_sent and long_msgno both use the 16-bit field at offset 0;
+ *  - bc_netid uses the entire word;
+ *  - link_selector uses only bit 0.
+ */
+
+static inline u32 msg_last_bcast(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+
+static inline u32 msg_fragm_no(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+
+static inline u32 msg_next_sent(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+
+static inline u32 msg_long_msgno(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+static inline u32 msg_bc_netid(struct tipc_msg *m)
+{
+	return msg_word(m, 4);
+}
+
+static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
+{
+	msg_set_word(m, 4, id);
+}
+
+static inline u32 msg_link_selector(struct tipc_msg *m)
+{
+	return msg_bits(m, 4, 0, 1);
+}
+
+/* Only the least significant bit of 'n' is stored. */
+static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 4, 0, 1, (n & 1));
+}
+
+/*
+ * Word 5
+ *
+ * Field offsets used below match row w5 of the diagram: probe at bit 0,
+ * network plane at bits 1-3, link priority at bits 4-8, bearer id at
+ * bits 9-11, redundant-link flag at bit 12, session number at bits 16-31.
+ */
+
+static inline u32 msg_session(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 16, 0xffff);
+}
+
+static inline void msg_set_session(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 16, 0xffff, n);
+}
+
+static inline u32 msg_probe(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 0, 1);
+}
+
+/* Only the least significant bit of 'val' is stored. */
+static inline void msg_set_probe(struct tipc_msg *m, u32 val)
+{
+	msg_set_bits(m, 5, 0, 1, (val & 1));
+}
+
+/* The network plane is carried as a 3-bit offset from the letter 'A'. */
+static inline char msg_net_plane(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 1, 7) + 'A';
+}
+
+static inline void msg_set_net_plane(struct tipc_msg *m, char n)
+{
+	msg_set_bits(m, 5, 1, 7, (n - 'A'));
+}
+
+static inline u32 msg_linkprio(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 4, 0x1f);
+}
+
+static inline void msg_set_linkprio(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 4, 0x1f, n);
+}
+
+static inline u32 msg_bearer_id(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 9, 0x7);
+}
+
+static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 5, 9, 0x7, n);
+}
+
+static inline u32 msg_redundant_link(struct tipc_msg *m)
+{
+	return msg_bits(m, 5, 12, 0x1);
+}
+
+static inline void msg_set_redundant_link(struct tipc_msg *m)
+{
+	msg_set_bits(m, 5, 12, 0x1, 1);
+}
+
+static inline void msg_clear_redundant_link(struct tipc_msg *m)
+{
+	msg_set_bits(m, 5, 12, 0x1, 0);
+}
+
+
+/*
+ * Word 9
+ *
+ * msgcnt, bcast_tag and max_pkt all use the same 16-bit field at offset 16.
+ * max_pkt is stored divided by 4 and multiplied by 4 on read, so a value
+ * that is not a multiple of 4 is rounded down by a set/get round trip.
+ */
+
+static inline u32 msg_msgcnt(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcast_tag(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_max_pkt(struct tipc_msg *m)
+{
+	return (msg_bits(m, 9, 16, 0xffff) * 4);
+}
+
+static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 16, 0xffff, (n / 4));
+}
+
+static inline u32 msg_link_tolerance(struct tipc_msg *m)
+{
+	return msg_bits(m, 9, 0, 0xffff);
+}
+
+static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 9, 0, 0xffff, n);
+}
+
+/*
+ * Routing table message data
+ *
+ * The remote node address is the 32-bit word at index msg_hdr_sz(m)/4,
+ * i.e. the first word following the header.  The data octet accessors
+ * index msg_data(m) at 'pos + 4'; presumably this skips that first word,
+ * assuming msg_data() points at the first byte after the header (confirm
+ * against msg_data()'s definition).
+ */
+
+
+static inline u32 msg_remote_node(struct tipc_msg *m)
+{
+	return msg_word(m, msg_hdr_sz(m)/4);
+}
+
+static inline void msg_set_remote_node(struct tipc_msg *m, u32 a)
+{
+	msg_set_word(m, msg_hdr_sz(m)/4, a);
+}
+
+/* Returns 1 if the octet at 'pos' is non-zero, otherwise 0. */
+static inline int msg_dataoctet(struct tipc_msg *m, u32 pos)
+{
+	return(msg_data(m)[pos + 4] != 0);
+}
+
+/* Marks the octet at 'pos' by writing 1 into it; there is no "clear". */
+static inline void msg_set_dataoctet(struct tipc_msg *m, u32 pos)
+{
+	msg_data(m)[pos + 4] = 1;
+}
+
+/*
+ * Segmentation message types
+ */
+
+#define FIRST_FRAGMENT 0
+#define FRAGMENT 1
+#define LAST_FRAGMENT 2
+
+/*
+ * Link management protocol message types
+ */
+
+#define STATE_MSG 0
+#define RESET_MSG 1
+#define ACTIVATE_MSG 2
+
+/*
+ * Changeover tunnel message types
+ */
+#define DUPLICATE_MSG 0
+#define ORIGINAL_MSG 1
+
+/*
+ * Routing table message types
+ */
+#define EXT_ROUTING_TABLE 0
+#define LOCAL_ROUTING_TABLE 1
+#define SLAVE_ROUTING_TABLE 2
+#define ROUTE_ADDITION 3
+#define ROUTE_REMOVAL 4
+
+/*
+ * Config protocol message types
+ */
+
+#define DSC_REQ_MSG 0
+#define DSC_RESP_MSG 1
+
+/*
+ * msg_tot_importance - importance value used to rank a message
+ *
+ * Data messages: the header's importance, plus 4 when the originating
+ * node is not this node (tipc_own_addr).
+ * First fragment of a fragmented message: the importance of the wrapped
+ * (inner) message.
+ * Anything else: the header's own importance.
+ */
+static inline u32 msg_tot_importance(struct tipc_msg *m)
+{
+	if (likely(msg_isdata(m))) {
+		if (likely(msg_orignode(m) == tipc_own_addr))
+			return msg_importance(m);
+		return msg_importance(m) + 4;
+	}
+	if ((msg_user(m) == MSG_FRAGMENTER) &&
+	    (msg_type(m) == FIRST_FRAGMENT))
+		return msg_importance(msg_get_wrapped(m));
+	return msg_importance(m);
+}
+
+
+/*
+ * msg_init - zero and fill in a message header of 'hsize' bytes
+ *
+ * Clears the first 'hsize' bytes at 'm', then sets version, user, header
+ * size, total size (initially equal to the header size), previous node
+ * (this node), type and error code.  The originating and destination node
+ * words are written only when the resulting header is not a short one
+ * (msg_short() is false).  Callers that append data must update the size
+ * afterwards with msg_set_size().
+ */
+static inline void msg_init(struct tipc_msg *m, u32 user, u32 type,
+			    u32 err, u32 hsize, u32 destnode)
+{
+	memset(m, 0, hsize);
+	msg_set_version(m);
+	msg_set_user(m, user);
+	msg_set_hdr_sz(m, hsize);
+	msg_set_size(m, hsize);
+	msg_set_prevnode(m, tipc_own_addr);
+	msg_set_type(m, type);
+	msg_set_errcode(m, err);
+	if (!msg_short(m)) {
+		msg_set_orignode(m, tipc_own_addr);
+		msg_set_destnode(m, destnode);
+	}
+}
+
+/**
+ * msg_calc_data_size - 
determine total data size for message
+ */
+
+static inline int msg_calc_data_size(struct iovec const *msg_sect, u32 num_sect)
+{
+	int dsz = 0;
+	int i;
+
+	/*
+	 * NOTE(review): the section lengths are accumulated into a signed int
+	 * and 'i' is compared against an unsigned count; nothing here guards
+	 * against the sum exceeding INT_MAX.  msg_build() only checks the
+	 * result afterwards against TIPC_MAX_USER_MSG_SIZE.
+	 */
+	for (i = 0; i < num_sect; i++)
+		dsz += msg_sect[i].iov_len;
+	return dsz;
+}
+
+/**
+ * msg_build - create message using specified header and data
+ *
+ * Note: Caller must not hold any locks in case copy_from_user() is interrupted!
+ *
+ * Returns message data size or errno
+ *
+ * In detail:
+ *  - -EINVAL, *buf == NULL: total data size exceeds TIPC_MAX_USER_MSG_SIZE
+ *  - data size, *buf == NULL: header + data exceeds max_size; no buffer is
+ *    built, but the size field of 'hdr' has already been updated
+ *  - -ENOMEM, *buf == NULL: buf_acquire() failed
+ *  - -EFAULT, *buf == NULL: copy_from_user() left bytes uncopied; the
+ *    partially filled buffer is discarded
+ *  - data size, *buf != NULL: success
+ *
+ * Callers therefore have to inspect *buf as well as the return value.
+ */
+
+static inline int msg_build(struct tipc_msg *hdr,
+			    struct iovec const *msg_sect, u32 num_sect,
+			    int max_size, int usrmem, struct sk_buff** buf)
+{
+	int dsz, sz, hsz, pos, res, cnt;
+
+	dsz = msg_calc_data_size(msg_sect, num_sect);
+	if (unlikely(dsz > TIPC_MAX_USER_MSG_SIZE)) {
+		*buf = NULL;
+		return -EINVAL;
+	}
+
+	/* 'pos' is the write offset into the buffer; data starts after the header */
+	pos = hsz = msg_hdr_sz(hdr);
+	sz = hsz + dsz;
+	msg_set_size(hdr, sz);
+	if (unlikely(sz > max_size)) {
+		*buf = NULL;
+		return dsz;
+	}
+
+	*buf = buf_acquire(sz);
+	if (!(*buf))
+		return -ENOMEM;
+	memcpy((*buf)->data, (unchar *)hdr, hsz);
+	/* 'res' stays non-zero while copying succeeds; the loop stops on the first failure */
+	for (res = 1, cnt = 0; res && (cnt < num_sect); cnt++) {
+		if (likely(usrmem))
+			res = !copy_from_user((*buf)->data + pos,
+					      msg_sect[cnt].iov_base,
+					      msg_sect[cnt].iov_len);
+		else
+			memcpy((*buf)->data + pos, msg_sect[cnt].iov_base,
+			       msg_sect[cnt].iov_len);
+		pos += msg_sect[cnt].iov_len;
+	}
+	if (likely(res))
+		return dsz;
+
+	buf_discard(*buf);
+	*buf = NULL;
+	return -EFAULT;
+}
+
+
+struct tipc_media_addr;
+
+extern void msg_set_media_addr(struct tipc_msg *m,
+			       struct tipc_media_addr *a);
+
+extern void msg_get_media_addr(struct tipc_msg *m,
+			       struct tipc_media_addr *a);
+
+
+#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
new file mode 100644
index 000000000000..41cbaf1a4a73
--- /dev/null
+++ b/net/tipc/name_distr.c
@@ -0,0 +1,309 @@
+/*
+ * net/tipc/name_distr.c: TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "core.h"
+#include "cluster.h"
+#include "dbg.h"
+#include "link.h"
+#include "msg.h"
+#include "name_distr.h"
+
+#undef DBG_OUTPUT
+#define DBG_OUTPUT NULL
+
+/*
+ * Size in bytes of one publication item; used in this file both to size
+ * outgoing messages and to count the items in a received message.
+ */
+#define ITEM_SIZE sizeof(struct distr_item)
+
+/**
+ * struct distr_item - publication info distributed to other nodes
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @ref: publishing port reference
+ * @key: publication key
+ *
+ * ===> All fields are stored in network byte order. <===
+ *
+ * First 3 fields identify (name or) name sequence being published.
+ * Reference field uniquely identifies port that published name sequence.
+ * Key field uniquely identifies publication, in the event a port has
+ * multiple publications of the same name sequence.
+ *
+ * Note: There is no field that identifies the publishing node because it is
+ * the same for all items contained within a publication message.
+ */
+
+struct distr_item {
+	u32 type;
+	u32 lower;
+	u32 upper;
+	u32 ref;
+	u32 key;
+};
+
+/**
+ * List of externally visible publications by this node --
+ * that is, all publications having scope > TIPC_NODE_SCOPE. 
+ */ + +static LIST_HEAD(publ_root); +static u32 publ_cnt = 0; + +/** + * publ_to_item - add publication info to a publication message + */ + +static void publ_to_item(struct distr_item *i, struct publication *p) +{ + i->type = htonl(p->type); + i->lower = htonl(p->lower); + i->upper = htonl(p->upper); + i->ref = htonl(p->ref); + i->key = htonl(p->key); + dbg("publ_to_item: %u, %u, %u\n", p->type, p->lower, p->upper); +} + +/** + * named_prepare_buf - allocate & initialize a publication message + */ + +static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) +{ + struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size); + struct tipc_msg *msg; + + if (buf != NULL) { + msg = buf_msg(buf); + msg_init(msg, NAME_DISTRIBUTOR, type, TIPC_OK, + LONG_H_SIZE, dest); + msg_set_size(msg, LONG_H_SIZE + size); + } + return buf; +} + +/** + * named_publish - tell other nodes about a new publication by this node + */ + +void named_publish(struct publication *publ) +{ + struct sk_buff *buf; + struct distr_item *item; + + list_add(&publ->local_list, &publ_root); + publ_cnt++; + + buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0); + if (!buf) { + warn("Memory squeeze; failed to distribute publication\n"); + return; + } + + item = (struct distr_item *)msg_data(buf_msg(buf)); + publ_to_item(item, publ); + dbg("named_withdraw: broadcasting publish msg\n"); + cluster_broadcast(buf); +} + +/** + * named_withdraw - tell other nodes about a withdrawn publication by this node + */ + +void named_withdraw(struct publication *publ) +{ + struct sk_buff *buf; + struct distr_item *item; + + list_del(&publ->local_list); + publ_cnt--; + + buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0); + if (!buf) { + warn("Memory squeeze; failed to distribute withdrawal\n"); + return; + } + + item = (struct distr_item *)msg_data(buf_msg(buf)); + publ_to_item(item, publ); + dbg("named_withdraw: broadcasting withdraw msg\n"); + cluster_broadcast(buf); +} + +/** + * named_node_up - tell specified node 
about all publications by this node
+ */
+
+void named_node_up(unsigned long node)
+{
+	struct publication *publ;
+	struct distr_item *item = 0;
+	struct sk_buff *buf = 0;
+	u32 left = 0;		/* payload bytes still unfilled in 'buf' */
+	u32 rest;		/* payload bytes not yet assigned to any buffer */
+	u32 max_item_buf;	/* largest payload holding a whole number of items */
+
+	assert(in_own_cluster(node));
+	read_lock_bh(&nametbl_lock);
+	/* round TIPC_MAX_USER_MSG_SIZE down to a multiple of ITEM_SIZE */
+	max_item_buf = TIPC_MAX_USER_MSG_SIZE / ITEM_SIZE;
+	max_item_buf *= ITEM_SIZE;
+	rest = publ_cnt * ITEM_SIZE;
+
+	/*
+	 * Pack the publications into as few messages as possible: a new
+	 * buffer is started whenever none is open, and it is sent as soon as
+	 * it is full.  This relies on publ_cnt matching the number of
+	 * entries on publ_root, so that the last buffer fills up exactly on
+	 * the last entry.
+	 */
+	list_for_each_entry(publ, &publ_root, local_list) {
+		if (!buf) {
+			left = (rest <= max_item_buf) ? rest : max_item_buf;
+			rest -= left;
+			buf = named_prepare_buf(PUBLICATION, left, node);
+			if (buf == NULL) {
+				/* buffers sent earlier stay sent; the rest is dropped */
+				warn("Memory Squeeze; could not send publication\n");
+				goto exit;
+			}
+			item = (struct distr_item *)msg_data(buf_msg(buf));
+		}
+		publ_to_item(item, publ);
+		item++;
+		left -= ITEM_SIZE;
+		if (!left) {
+			msg_set_link_selector(buf_msg(buf), node);
+			dbg("named_node_up: sending publish msg to "
+			    "<%u.%u.%u>\n", tipc_zone(node),
+			    tipc_cluster(node), tipc_node(node));
+			link_send(buf, node, node);
+			buf = 0;	/* buffer handed to link_send(); start a new one next time */
+		}
+	}
+exit:
+	read_unlock_bh(&nametbl_lock);
+}
+
+/**
+ * node_is_down - remove publication associated with a failed node
+ *
+ * Invoked for each publication issued by a newly failed node.
+ * Removes publication structure from name table & deletes it.
+ * In rare cases the link may have come back up again when this
+ * function is called, and we have two items representing the same
+ * publication. Nudge this item's key to distinguish it from the other.
+ * (Note: Publication's node subscription is already unsubscribed.) 
+ */ + +static void node_is_down(struct publication *publ) +{ + struct publication *p; + write_lock_bh(&nametbl_lock); + dbg("node_is_down: withdrawing %u, %u, %u\n", + publ->type, publ->lower, publ->upper); + publ->key += 1222345; + p = nametbl_remove_publ(publ->type, publ->lower, + publ->node, publ->ref, publ->key); + assert(p == publ); + write_unlock_bh(&nametbl_lock); + if (publ) + kfree(publ); +} + +/** + * named_recv - process name table update message sent by another node + */ + +void named_recv(struct sk_buff *buf) +{ + struct publication *publ; + struct tipc_msg *msg = buf_msg(buf); + struct distr_item *item = (struct distr_item *)msg_data(msg); + u32 count = msg_data_sz(msg) / ITEM_SIZE; + + write_lock_bh(&nametbl_lock); + while (count--) { + if (msg_type(msg) == PUBLICATION) { + dbg("named_recv: got publication for %u, %u, %u\n", + ntohl(item->type), ntohl(item->lower), + ntohl(item->upper)); + publ = nametbl_insert_publ(ntohl(item->type), + ntohl(item->lower), + ntohl(item->upper), + TIPC_CLUSTER_SCOPE, + msg_orignode(msg), + ntohl(item->ref), + ntohl(item->key)); + if (publ) { + nodesub_subscribe(&publ->subscr, + msg_orignode(msg), + publ, + (net_ev_handler)node_is_down); + } + } else if (msg_type(msg) == WITHDRAWAL) { + dbg("named_recv: got withdrawl for %u, %u, %u\n", + ntohl(item->type), ntohl(item->lower), + ntohl(item->upper)); + publ = nametbl_remove_publ(ntohl(item->type), + ntohl(item->lower), + msg_orignode(msg), + ntohl(item->ref), + ntohl(item->key)); + + if (publ) { + nodesub_unsubscribe(&publ->subscr); + kfree(publ); + } + } else { + warn("named_recv: unknown msg\n"); + } + item++; + } + write_unlock_bh(&nametbl_lock); + buf_discard(buf); +} + +/** + * named_reinit - re-initialize local publication list + * + * This routine is called whenever TIPC networking is (re)enabled. + * All existing publications by this node that have "cluster" or "zone" scope + * are updated to reflect the node's current network address. 
+ * (If the node's address is unchanged, the update loop terminates immediately.)
+ */
+
+void named_reinit(void)
+{
+	struct publication *publ;
+
+	write_lock_bh(&nametbl_lock);
+	list_for_each_entry(publ, &publ_root, local_list) {
+		/*
+		 * The first entry that already carries the current address
+		 * ends the walk; entries after it are not examined.
+		 */
+		if (publ->node == tipc_own_addr)
+			break;
+		publ->node = tipc_own_addr;
+	}
+	write_unlock_bh(&nametbl_lock);
+}
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
new file mode 100644
index 000000000000..a04bdeac84ea
--- /dev/null
+++ b/net/tipc/name_distr.h
@@ -0,0 +1,48 @@
+/*
+ * net/tipc/name_distr.h: Include file for TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NAME_DISTR_H +#define _TIPC_NAME_DISTR_H + +#include "name_table.h" + +void named_publish(struct publication *publ); +void named_withdraw(struct publication *publ); +void named_node_up(unsigned long node); +void named_recv(struct sk_buff *buf); +void named_reinit(void); + +#endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c new file mode 100644 index 000000000000..972c83eb83b4 --- /dev/null +++ b/net/tipc/name_table.c @@ -0,0 +1,1079 @@ +/* + * net/tipc/name_table.c: TIPC name table code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "dbg.h"
+#include "name_table.h"
+#include "name_distr.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "subscr.h"
+#include "port.h"
+#include "cluster.h"
+#include "bcast.h"
+
+int tipc_nametbl_size = 1024; /* must be a power of 2: hash() masks with (size - 1) */
+
+/**
+ * struct sub_seq - container for all published instances of a name sequence
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @node_list: circular list of matching publications with >= node scope
+ * @cluster_list: circular list of matching publications with >= cluster scope
+ * @zone_list: circular list of matching publications with >= zone scope
+ *
+ * Each list pointer is NULL while the corresponding list is empty.
+ */
+
+struct sub_seq {
+	u32 lower;
+	u32 upper;
+	struct publication *node_list;
+	struct publication *cluster_list;
+	struct publication *zone_list;
+};
+
+/**
+ * struct name_seq - container for all published instances of a name type
+ * @type: 32 bit 'type' value for name sequence
+ * @sseqs: pointer to dynamically-sized 
array of sub-sequences of this 'type';
+ * sub-sequences are sorted in ascending order
+ * @alloc: number of sub-sequences currently in array
+ * @first_free: upper bound of highest sub-sequence + 1
+ * @ns_list: links to adjacent name sequences in hash chain
+ * @subscriptions: list of subscriptions for this 'type'
+ * @lock: spinlock controlling access to name sequence structure
+ */
+
+struct name_seq {
+	u32 type;
+	struct sub_seq *sseqs;
+	u32 alloc;		/* as used by the code: number of entries allocated in sseqs[] */
+	u32 first_free;		/* as used by the code: index of first unused entry in sseqs[] */
+	struct hlist_node ns_list;
+	struct list_head subscriptions;
+	spinlock_t lock;
+};
+
+/**
+ * struct name_table - table containing all existing port name publications
+ * @types: pointer to fixed-sized array of name sequence lists,
+ * accessed via hashing on 'type'; name sequence lists are *not* sorted
+ * @local_publ_count: number of publications issued by this node
+ */
+
+struct name_table {
+	struct hlist_head *types;
+	u32 local_publ_count;
+};
+
+struct name_table table = { NULL } ;
+static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
+rwlock_t nametbl_lock = RW_LOCK_UNLOCKED;
+
+
+/*
+ * hash - map a name type to an index into table.types[]
+ *
+ * Keeps the low bits of 'x' by masking with (tipc_nametbl_size - 1), which
+ * is why the table size has to be a power of 2.
+ */
+static inline int hash(int x)
+{
+	return(x & (tipc_nametbl_size - 1));
+}
+
+/**
+ * publ_create - create a publication structure
+ *
+ * Returns a zeroed publication with the given fields filled in and its
+ * three list heads initialised, or NULL (after logging a warning) if the
+ * allocation fails.  The allocation uses GFP_ATOMIC.
+ */
+
+static struct publication *publ_create(u32 type, u32 lower, u32 upper,
+				       u32 scope, u32 node, u32 port_ref,
+				       u32 key)
+{
+	struct publication *publ =
+		(struct publication *)kmalloc(sizeof(*publ), GFP_ATOMIC);
+	if (publ == NULL) {
+		warn("Memory squeeze; failed to create publication\n");
+		return 0;
+	}
+
+	memset(publ, 0, sizeof(*publ));
+	publ->type = type;
+	publ->lower = lower;
+	publ->upper = upper;
+	publ->scope = scope;
+	publ->node = node;
+	publ->ref = port_ref;
+	publ->key = key;
+	INIT_LIST_HEAD(&publ->local_list);
+	INIT_LIST_HEAD(&publ->pport_list);
+	INIT_LIST_HEAD(&publ->subscr.nodesub_list);
+	return publ;
+}
+
+/**
+ * subseq_alloc - allocate a specified number of sub-sequence structures
+ *
+ * Returns a zero-filled array of @cnt entries, or NULL on allocation failure.
+ * NOTE(review): the byte count is computed in a u32 without an overflow
+ * check on 'cnt'.
+ */
+
+struct sub_seq *subseq_alloc(u32 cnt)
+{
+	u32 sz = cnt * sizeof(struct sub_seq);
+	struct sub_seq *sseq = (struct sub_seq *)kmalloc(sz, GFP_ATOMIC);
+
+	if (sseq)
+		memset(sseq, 0, sz);
+	return sseq;
+}
+
+/**
+ * nameseq_create - create a name sequence structure for the specified 'type'
+ *
+ * Allocates a single sub-sequence structure and sets it to all 0's.
+ *
+ * The new name sequence is linked at the head of the hash chain @seq_head.
+ * Returns NULL if either allocation fails; whichever allocation did succeed
+ * is released again first.
+ */
+
+struct name_seq *nameseq_create(u32 type, struct hlist_head *seq_head)
+{
+	struct name_seq *nseq =
+		(struct name_seq *)kmalloc(sizeof(*nseq), GFP_ATOMIC);
+	struct sub_seq *sseq = subseq_alloc(1);
+
+	if (!nseq || !sseq) {
+		warn("Memory squeeze; failed to create name sequence\n");
+		kfree(nseq);
+		kfree(sseq);
+		return 0;
+	}
+
+	memset(nseq, 0, sizeof(*nseq));
+	nseq->lock = SPIN_LOCK_UNLOCKED;
+	nseq->type = type;
+	nseq->sseqs = sseq;
+	dbg("nameseq_create() nseq = %x type %u, ssseqs %x, ff: %u\n",
+	    nseq, type, nseq->sseqs, nseq->first_free);
+	nseq->alloc = 1;	/* one entry allocated; first_free stays 0 from the memset */
+	INIT_HLIST_NODE(&nseq->ns_list);
+	INIT_LIST_HEAD(&nseq->subscriptions);
+	hlist_add_head(&nseq->ns_list, seq_head);
+	return nseq;
+}
+
+/**
+ * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
+ *
+ * Very time-critical, so binary searches through sub-sequence array.
+ *
+ * Only entries [0, first_free) are searched.  Returns a pointer to the entry
+ * whose [lower, upper] range contains @instance, or NULL if there is none.
+ */
+
+static inline struct sub_seq *nameseq_find_subseq(struct name_seq *nseq,
+						  u32 instance)
+{
+	struct sub_seq *sseqs = nseq->sseqs;
+	int low = 0;
+	int high = nseq->first_free - 1;
+	int mid;
+
+	while (low <= high) {
+		mid = (low + high) / 2;
+		if (instance < sseqs[mid].lower)
+			high = mid - 1;
+		else if (instance > sseqs[mid].upper)
+			low = mid + 1;
+		else
+			return &sseqs[mid];
+	}
+	return 0;
+}
+
+/**
+ * nameseq_locate_subseq - determine position of name instance in sub-sequence
+ *
+ * Returns index in sub-sequence array of the entry that contains the specified
+ * instance value; if no entry contains that value, returns the position
+ * where a new entry for it would be inserted in the array.
+ *
+ * Note: Similar to binary search code for locating a sub-sequence. 
+ */ + +static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance) +{ + struct sub_seq *sseqs = nseq->sseqs; + int low = 0; + int high = nseq->first_free - 1; + int mid; + + while (low <= high) { + mid = (low + high) / 2; + if (instance < sseqs[mid].lower) + high = mid - 1; + else if (instance > sseqs[mid].upper) + low = mid + 1; + else + return mid; + } + return low; +} + +/** + * nameseq_insert_publ - + */ + +struct publication *nameseq_insert_publ(struct name_seq *nseq, + u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, u32 key) +{ + struct subscription *s; + struct subscription *st; + struct publication *publ; + struct sub_seq *sseq; + int created_subseq = 0; + + assert(nseq->first_free <= nseq->alloc); + sseq = nameseq_find_subseq(nseq, lower); + dbg("nameseq_ins: for seq %x,<%u,%u>, found sseq %x\n", + nseq, type, lower, sseq); + if (sseq) { + + /* Lower end overlaps existing entry => need an exact match */ + + if ((sseq->lower != lower) || (sseq->upper != upper)) { + warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper); + return 0; + } + } else { + u32 inspos; + struct sub_seq *freesseq; + + /* Find where lower end should be inserted */ + + inspos = nameseq_locate_subseq(nseq, lower); + + /* Fail if upper end overlaps into an existing entry */ + + if ((inspos < nseq->first_free) && + (upper >= nseq->sseqs[inspos].lower)) { + warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper); + return 0; + } + + /* Ensure there is space for new sub-sequence */ + + if (nseq->first_free == nseq->alloc) { + struct sub_seq *sseqs = nseq->sseqs; + nseq->sseqs = subseq_alloc(nseq->alloc * 2); + if (nseq->sseqs != NULL) { + memcpy(nseq->sseqs, sseqs, + nseq->alloc * sizeof (struct sub_seq)); + kfree(sseqs); + dbg("Allocated %u sseqs\n", nseq->alloc); + nseq->alloc *= 2; + } else { + warn("Memory squeeze; failed to create sub-sequence\n"); + return 0; + } + } + dbg("Have %u sseqs for type %u\n", nseq->alloc, type); + + /* Insert new 
sub-sequence */ + + dbg("ins in pos %u, ff = %u\n", inspos, nseq->first_free); + sseq = &nseq->sseqs[inspos]; + freesseq = &nseq->sseqs[nseq->first_free]; + memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof (*sseq)); + memset(sseq, 0, sizeof (*sseq)); + nseq->first_free++; + sseq->lower = lower; + sseq->upper = upper; + created_subseq = 1; + } + dbg("inserting (%u %u %u) from %x:%u into sseq %x(%u,%u) of seq %x\n", + type, lower, upper, node, port, sseq, + sseq->lower, sseq->upper, nseq); + + /* Insert a publication: */ + + publ = publ_create(type, lower, upper, scope, node, port, key); + if (!publ) + return 0; + dbg("inserting publ %x, node=%x publ->node=%x, subscr->node=%x\n", + publ, node, publ->node, publ->subscr.node); + + if (!sseq->zone_list) + sseq->zone_list = publ->zone_list_next = publ; + else { + publ->zone_list_next = sseq->zone_list->zone_list_next; + sseq->zone_list->zone_list_next = publ; + } + + if (in_own_cluster(node)) { + if (!sseq->cluster_list) + sseq->cluster_list = publ->cluster_list_next = publ; + else { + publ->cluster_list_next = + sseq->cluster_list->cluster_list_next; + sseq->cluster_list->cluster_list_next = publ; + } + } + + if (node == tipc_own_addr) { + if (!sseq->node_list) + sseq->node_list = publ->node_list_next = publ; + else { + publ->node_list_next = sseq->node_list->node_list_next; + sseq->node_list->node_list_next = publ; + } + } + + /* + * Any subscriptions waiting for notification? 
+ */ + list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { + dbg("calling report_overlap()\n"); + subscr_report_overlap(s, + publ->lower, + publ->upper, + TIPC_PUBLISHED, + publ->ref, + publ->node, + created_subseq); + } + return publ; +} + +/** + * nameseq_remove_publ - + */ + +struct publication *nameseq_remove_publ(struct name_seq *nseq, u32 inst, + u32 node, u32 ref, u32 key) +{ + struct publication *publ; + struct publication *prev; + struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); + struct sub_seq *free; + struct subscription *s, *st; + int removed_subseq = 0; + + assert(nseq); + + if (!sseq) { + int i; + + warn("Withdraw unknown <%u,%u>?\n", nseq->type, inst); + assert(nseq->sseqs); + dbg("Dumping subseqs %x for %x, alloc = %u,ff=%u\n", + nseq->sseqs, nseq, nseq->alloc, + nseq->first_free); + for (i = 0; i < nseq->first_free; i++) { + dbg("Subseq %u(%x): lower = %u,upper = %u\n", + i, &nseq->sseqs[i], nseq->sseqs[i].lower, + nseq->sseqs[i].upper); + } + return 0; + } + dbg("nameseq_remove: seq: %x, sseq %x, <%u,%u> key %u\n", + nseq, sseq, nseq->type, inst, key); + + prev = sseq->zone_list; + publ = sseq->zone_list->zone_list_next; + while ((publ->key != key) || (publ->ref != ref) || + (publ->node && (publ->node != node))) { + prev = publ; + publ = publ->zone_list_next; + assert(prev != sseq->zone_list); + } + if (publ != sseq->zone_list) + prev->zone_list_next = publ->zone_list_next; + else if (publ->zone_list_next != publ) { + prev->zone_list_next = publ->zone_list_next; + sseq->zone_list = publ->zone_list_next; + } else { + sseq->zone_list = 0; + } + + if (in_own_cluster(node)) { + prev = sseq->cluster_list; + publ = sseq->cluster_list->cluster_list_next; + while ((publ->key != key) || (publ->ref != ref) || + (publ->node && (publ->node != node))) { + prev = publ; + publ = publ->cluster_list_next; + assert(prev != sseq->cluster_list); + } + if (publ != sseq->cluster_list) + prev->cluster_list_next = publ->cluster_list_next; + else 
if (publ->cluster_list_next != publ) { + prev->cluster_list_next = publ->cluster_list_next; + sseq->cluster_list = publ->cluster_list_next; + } else { + sseq->cluster_list = 0; + } + } + + if (node == tipc_own_addr) { + prev = sseq->node_list; + publ = sseq->node_list->node_list_next; + while ((publ->key != key) || (publ->ref != ref) || + (publ->node && (publ->node != node))) { + prev = publ; + publ = publ->node_list_next; + assert(prev != sseq->node_list); + } + if (publ != sseq->node_list) + prev->node_list_next = publ->node_list_next; + else if (publ->node_list_next != publ) { + prev->node_list_next = publ->node_list_next; + sseq->node_list = publ->node_list_next; + } else { + sseq->node_list = 0; + } + } + assert(!publ->node || (publ->node == node)); + assert(publ->ref == ref); + assert(publ->key == key); + + /* + * Contract subseq list if no more publications: + */ + if (!sseq->node_list && !sseq->cluster_list && !sseq->zone_list) { + free = &nseq->sseqs[nseq->first_free--]; + memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof (*sseq)); + removed_subseq = 1; + } + + /* + * Any subscriptions waiting ? 
+ */ + list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { + subscr_report_overlap(s, + publ->lower, + publ->upper, + TIPC_WITHDRAWN, + publ->ref, + publ->node, + removed_subseq); + } + return publ; +} + +/** + * nameseq_subscribe: attach a subscription, and issue + * the prescribed number of events if there is any sub- + * sequence overlapping with the requested sequence + */ + +void nameseq_subscribe(struct name_seq *nseq, struct subscription *s) +{ + struct sub_seq *sseq = nseq->sseqs; + + list_add(&s->nameseq_list, &nseq->subscriptions); + + if (!sseq) + return; + + while (sseq != &nseq->sseqs[nseq->first_free]) { + struct publication *zl = sseq->zone_list; + if (zl && subscr_overlap(s,sseq->lower,sseq->upper)) { + struct publication *crs = zl; + int must_report = 1; + + do { + subscr_report_overlap(s, + sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, + crs->node, + must_report); + must_report = 0; + crs = crs->zone_list_next; + } while (crs != zl); + } + sseq++; + } +} + +static struct name_seq *nametbl_find_seq(u32 type) +{ + struct hlist_head *seq_head; + struct hlist_node *seq_node; + struct name_seq *ns; + + dbg("find_seq %u,(%u,0x%x) table = %p, hash[type] = %u\n", + type, ntohl(type), type, table.types, hash(type)); + + seq_head = &table.types[hash(type)]; + hlist_for_each_entry(ns, seq_node, seq_head, ns_list) { + if (ns->type == type) { + dbg("found %x\n", ns); + return ns; + } + } + + return 0; +}; + +struct publication *nametbl_insert_publ(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, u32 key) +{ + struct name_seq *seq = nametbl_find_seq(type); + + dbg("ins_publ: <%u,%x,%x> found %x\n", type, lower, upper, seq); + if (lower > upper) { + warn("Failed to publish illegal <%u,%u,%u>\n", + type, lower, upper); + return 0; + } + + dbg("Publishing <%u,%u,%u> from %x\n", type, lower, upper, node); + if (!seq) { + seq = nameseq_create(type, &table.types[hash(type)]); + dbg("nametbl_insert_publ: created %x\n", 
seq); + } + if (!seq) + return 0; + + assert(seq->type == type); + return nameseq_insert_publ(seq, type, lower, upper, + scope, node, port, key); +} + +struct publication *nametbl_remove_publ(u32 type, u32 lower, + u32 node, u32 ref, u32 key) +{ + struct publication *publ; + struct name_seq *seq = nametbl_find_seq(type); + + if (!seq) + return 0; + + dbg("Withdrawing <%u,%u> from %x\n", type, lower, node); + publ = nameseq_remove_publ(seq, lower, node, ref, key); + + if (!seq->first_free && list_empty(&seq->subscriptions)) { + hlist_del_init(&seq->ns_list); + kfree(seq->sseqs); + kfree(seq); + } + return publ; +} + +/* + * nametbl_translate(): Translate tipc_name -> tipc_portid. + * Very time-critical. + * + * Note: on entry 'destnode' is the search domain used during translation; + * on exit it passes back the node address of the matching port (if any) + */ + +u32 nametbl_translate(u32 type, u32 instance, u32 *destnode) +{ + struct sub_seq *sseq; + struct publication *publ = 0; + struct name_seq *seq; + u32 ref; + + if (!in_scope(*destnode, tipc_own_addr)) + return 0; + + read_lock_bh(&nametbl_lock); + seq = nametbl_find_seq(type); + if (unlikely(!seq)) + goto not_found; + sseq = nameseq_find_subseq(seq, instance); + if (unlikely(!sseq)) + goto not_found; + spin_lock_bh(&seq->lock); + + /* Closest-First Algorithm: */ + if (likely(!*destnode)) { + publ = sseq->node_list; + if (publ) { + sseq->node_list = publ->node_list_next; +found: + ref = publ->ref; + *destnode = publ->node; + spin_unlock_bh(&seq->lock); + read_unlock_bh(&nametbl_lock); + return ref; + } + publ = sseq->cluster_list; + if (publ) { + sseq->cluster_list = publ->cluster_list_next; + goto found; + } + publ = sseq->zone_list; + if (publ) { + sseq->zone_list = publ->zone_list_next; + goto found; + } + } + + /* Round-Robin Algorithm: */ + else if (*destnode == tipc_own_addr) { + publ = sseq->node_list; + if (publ) { + sseq->node_list = publ->node_list_next; + goto found; + } + } else if 
(in_own_cluster(*destnode)) { + publ = sseq->cluster_list; + if (publ) { + sseq->cluster_list = publ->cluster_list_next; + goto found; + } + } else { + publ = sseq->zone_list; + if (publ) { + sseq->zone_list = publ->zone_list_next; + goto found; + } + } + spin_unlock_bh(&seq->lock); +not_found: + *destnode = 0; + read_unlock_bh(&nametbl_lock); + return 0; +} + +/** + * nametbl_mc_translate - find multicast destinations + * + * Creates list of all local ports that overlap the given multicast address; + * also determines if any off-node ports overlap. + * + * Note: Publications with a scope narrower than 'limit' are ignored. + * (i.e. local node-scope publications mustn't receive messages arriving + * from another node, even if the multcast link brought it here) + * + * Returns non-zero if any off-node ports overlap + */ + +int nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, + struct port_list *dports) +{ + struct name_seq *seq; + struct sub_seq *sseq; + struct sub_seq *sseq_stop; + int res = 0; + + read_lock_bh(&nametbl_lock); + seq = nametbl_find_seq(type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + + sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); + sseq_stop = seq->sseqs + seq->first_free; + for (; sseq != sseq_stop; sseq++) { + struct publication *publ; + + if (sseq->lower > upper) + break; + publ = sseq->cluster_list; + if (publ && (publ->scope <= limit)) + do { + if (publ->node == tipc_own_addr) + port_list_add(dports, publ->ref); + else + res = 1; + publ = publ->cluster_list_next; + } while (publ != sseq->cluster_list); + } + + spin_unlock_bh(&seq->lock); +exit: + read_unlock_bh(&nametbl_lock); + return res; +} + +/** + * nametbl_publish_rsv - publish port name using a reserved name type + */ + +int nametbl_publish_rsv(u32 ref, unsigned int scope, + struct tipc_name_seq const *seq) +{ + int res; + + atomic_inc(&rsv_publ_ok); + res = tipc_publish(ref, scope, seq); + atomic_dec(&rsv_publ_ok); + return res; +} + +/** + * 
nametbl_publish - add name publication to network name tables + */ + +struct publication *nametbl_publish(u32 type, u32 lower, u32 upper, + u32 scope, u32 port_ref, u32 key) +{ + struct publication *publ; + + if (table.local_publ_count >= tipc_max_publications) { + warn("Failed publish: max %u local publication\n", + tipc_max_publications); + return 0; + } + if ((type < TIPC_RESERVED_TYPES) && !atomic_read(&rsv_publ_ok)) { + warn("Failed to publish reserved name <%u,%u,%u>\n", + type, lower, upper); + return 0; + } + + write_lock_bh(&nametbl_lock); + table.local_publ_count++; + publ = nametbl_insert_publ(type, lower, upper, scope, + tipc_own_addr, port_ref, key); + if (publ && (scope != TIPC_NODE_SCOPE)) { + named_publish(publ); + } + write_unlock_bh(&nametbl_lock); + return publ; +} + +/** + * nametbl_withdraw - withdraw name publication from network name tables + */ + +int nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) +{ + struct publication *publ; + + dbg("nametbl_withdraw:<%d,%d,%d>\n", type, lower, key); + write_lock_bh(&nametbl_lock); + publ = nametbl_remove_publ(type, lower, tipc_own_addr, ref, key); + if (publ) { + table.local_publ_count--; + if (publ->scope != TIPC_NODE_SCOPE) + named_withdraw(publ); + write_unlock_bh(&nametbl_lock); + list_del_init(&publ->pport_list); + kfree(publ); + return 1; + } + write_unlock_bh(&nametbl_lock); + return 0; +} + +/** + * nametbl_subscribe - add a subscription object to the name table + */ + +void +nametbl_subscribe(struct subscription *s) +{ + u32 type = s->seq.type; + struct name_seq *seq; + + write_lock_bh(&nametbl_lock); + seq = nametbl_find_seq(type); + if (!seq) { + seq = nameseq_create(type, &table.types[hash(type)]); + } + if (seq){ + spin_lock_bh(&seq->lock); + dbg("nametbl_subscribe:found %x for <%u,%u,%u>\n", + seq, type, s->seq.lower, s->seq.upper); + assert(seq->type == type); + nameseq_subscribe(seq, s); + spin_unlock_bh(&seq->lock); + } + write_unlock_bh(&nametbl_lock); +} + +/** + * 
nametbl_unsubscribe - remove a subscription object from name table + */ + +void +nametbl_unsubscribe(struct subscription *s) +{ + struct name_seq *seq; + + write_lock_bh(&nametbl_lock); + seq = nametbl_find_seq(s->seq.type); + if (seq != NULL){ + spin_lock_bh(&seq->lock); + list_del_init(&s->nameseq_list); + spin_unlock_bh(&seq->lock); + if ((seq->first_free == 0) && list_empty(&seq->subscriptions)) { + hlist_del_init(&seq->ns_list); + kfree(seq->sseqs); + kfree(seq); + } + } + write_unlock_bh(&nametbl_lock); +} + + +/** + * subseq_list: print specified sub-sequence contents into the given buffer + */ + +static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, + u32 index) +{ + char portIdStr[27]; + char *scopeStr; + struct publication *publ = sseq->zone_list; + + tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); + + if (depth == 2 || !publ) { + tipc_printf(buf, "\n"); + return; + } + + do { + sprintf (portIdStr, "<%u.%u.%u:%u>", + tipc_zone(publ->node), tipc_cluster(publ->node), + tipc_node(publ->node), publ->ref); + tipc_printf(buf, "%-26s ", portIdStr); + if (depth > 3) { + if (publ->node != tipc_own_addr) + scopeStr = ""; + else if (publ->scope == TIPC_NODE_SCOPE) + scopeStr = "node"; + else if (publ->scope == TIPC_CLUSTER_SCOPE) + scopeStr = "cluster"; + else + scopeStr = "zone"; + tipc_printf(buf, "%-10u %s", publ->key, scopeStr); + } + + publ = publ->zone_list_next; + if (publ == sseq->zone_list) + break; + + tipc_printf(buf, "\n%33s", " "); + } while (1); + + tipc_printf(buf, "\n"); +} + +/** + * nameseq_list: print specified name sequence contents into the given buffer + */ + +static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, + u32 type, u32 lowbound, u32 upbound, u32 index) +{ + struct sub_seq *sseq; + char typearea[11]; + + sprintf(typearea, "%-10u", seq->type); + + if (depth == 1) { + tipc_printf(buf, "%s\n", typearea); + return; + } + + for (sseq = seq->sseqs; sseq != 
&seq->sseqs[seq->first_free]; sseq++) { + if ((lowbound <= sseq->upper) && (upbound >= sseq->lower)) { + tipc_printf(buf, "%s ", typearea); + subseq_list(sseq, buf, depth, index); + sprintf(typearea, "%10s", " "); + } + } +} + +/** + * nametbl_header - print name table header into the given buffer + */ + +static void nametbl_header(struct print_buf *buf, u32 depth) +{ + tipc_printf(buf, "Type "); + + if (depth > 1) + tipc_printf(buf, "Lower Upper "); + if (depth > 2) + tipc_printf(buf, "Port Identity "); + if (depth > 3) + tipc_printf(buf, "Publication"); + + tipc_printf(buf, "\n-----------"); + + if (depth > 1) + tipc_printf(buf, "--------------------- "); + if (depth > 2) + tipc_printf(buf, "-------------------------- "); + if (depth > 3) + tipc_printf(buf, "------------------"); + + tipc_printf(buf, "\n"); +} + +/** + * nametbl_list - print specified name table contents into the given buffer + */ + +static void nametbl_list(struct print_buf *buf, u32 depth_info, + u32 type, u32 lowbound, u32 upbound) +{ + struct hlist_head *seq_head; + struct hlist_node *seq_node; + struct name_seq *seq; + int all_types; + u32 depth; + u32 i; + + all_types = (depth_info & TIPC_NTQ_ALLTYPES); + depth = (depth_info & ~TIPC_NTQ_ALLTYPES); + + if (depth == 0) + return; + + if (all_types) { + /* display all entries in name table to specified depth */ + nametbl_header(buf, depth); + lowbound = 0; + upbound = ~0; + for (i = 0; i < tipc_nametbl_size; i++) { + seq_head = &table.types[i]; + hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + nameseq_list(seq, buf, depth, seq->type, + lowbound, upbound, i); + } + } + } else { + /* display only the sequence that matches the specified type */ + if (upbound < lowbound) { + tipc_printf(buf, "invalid name sequence specified\n"); + return; + } + nametbl_header(buf, depth); + i = hash(type); + seq_head = &table.types[i]; + hlist_for_each_entry(seq, seq_node, seq_head, ns_list) { + if (seq->type == type) { + nameseq_list(seq, buf, depth, 
type, + lowbound, upbound, i); + break; + } + } + } +} + +void nametbl_print(struct print_buf *buf, const char *str) +{ + tipc_printf(buf, str); + read_lock_bh(&nametbl_lock); + nametbl_list(buf, 0, 0, 0, 0); + read_unlock_bh(&nametbl_lock); +} + +#define MAX_NAME_TBL_QUERY 32768 + +struct sk_buff *nametbl_get(const void *req_tlv_area, int req_tlv_space) +{ + struct sk_buff *buf; + struct tipc_name_table_query *argv; + struct tlv_desc *rep_tlv; + struct print_buf b; + int str_len; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NAME_TBL_QUERY)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + buf = cfg_reply_alloc(TLV_SPACE(MAX_NAME_TBL_QUERY)); + if (!buf) + return NULL; + + rep_tlv = (struct tlv_desc *)buf->data; + printbuf_init(&b, TLV_DATA(rep_tlv), MAX_NAME_TBL_QUERY); + argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area); + read_lock_bh(&nametbl_lock); + nametbl_list(&b, ntohl(argv->depth), ntohl(argv->type), + ntohl(argv->lowbound), ntohl(argv->upbound)); + read_unlock_bh(&nametbl_lock); + str_len = printbuf_validate(&b); + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +void nametbl_dump(void) +{ + nametbl_list(CONS, 0, 0, 0, 0); +} + +int nametbl_init(void) +{ + int array_size = sizeof(struct hlist_head) * tipc_nametbl_size; + + table.types = (struct hlist_head *)kmalloc(array_size, GFP_ATOMIC); + if (!table.types) + return -ENOMEM; + + write_lock_bh(&nametbl_lock); + memset(table.types, 0, array_size); + table.local_publ_count = 0; + write_unlock_bh(&nametbl_lock); + return 0; +} + +void nametbl_stop(void) +{ + struct hlist_head *seq_head; + struct hlist_node *seq_node; + struct hlist_node *tmp; + struct name_seq *seq; + u32 i; + + if (!table.types) + return; + + write_lock_bh(&nametbl_lock); + for (i = 0; i < tipc_nametbl_size; i++) { + seq_head = &table.types[i]; + hlist_for_each_entry_safe(seq, seq_node, tmp, seq_head, ns_list) { + struct sub_seq *sseq = 
seq->sseqs; + + for (; sseq != &seq->sseqs[seq->first_free]; sseq++) { + struct publication *publ = sseq->zone_list; + assert(publ); + do { + struct publication *next = + publ->zone_list_next; + kfree(publ); + publ = next; + } + while (publ != sseq->zone_list); + } + } + } + kfree(table.types); + table.types = NULL; + write_unlock_bh(&nametbl_lock); +} diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h new file mode 100644 index 000000000000..f82693384f60 --- /dev/null +++ b/net/tipc/name_table.h @@ -0,0 +1,108 @@ +/* + * net/tipc/name_table.h: Include file for TIPC name table code + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NAME_TABLE_H +#define _TIPC_NAME_TABLE_H + +#include "node_subscr.h" + +struct subscription; +struct port_list; + +/* + * TIPC name types reserved for internal TIPC use (both current and planned) + */ + +#define TIPC_ZM_SRV 3 /* zone master service name type */ + + +/** + * struct publication - info about a published (name or) name sequence + * @type: name sequence type + * @lower: name sequence lower bound + * @upper: name sequence upper bound + * @scope: scope of publication + * @node: network address of publishing port's node + * @ref: publishing port + * @key: publication key + * @subscr: subscription to "node down" event (for off-node publications only) + * @local_list: adjacent entries in list of publications made by this node + * @pport_list: adjacent entries in list of publications made by this port + * @node_list: next matching name seq publication with >= node scope + * @cluster_list: next matching name seq publication with >= cluster scope + * @zone_list: next matching name seq publication with >= zone scope + * + * Note that the node list, cluster list, and zone list are circular lists. 
+ */ + +struct publication { + u32 type; + u32 lower; + u32 upper; + u32 scope; + u32 node; + u32 ref; + u32 key; + struct node_subscr subscr; + struct list_head local_list; + struct list_head pport_list; + struct publication *node_list_next; + struct publication *cluster_list_next; + struct publication *zone_list_next; +}; + + +extern rwlock_t nametbl_lock; + +struct sk_buff *nametbl_get(const void *req_tlv_area, int req_tlv_space); +u32 nametbl_translate(u32 type, u32 instance, u32 *node); +int nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, + struct port_list *dports); +int nametbl_publish_rsv(u32 ref, unsigned int scope, + struct tipc_name_seq const *seq); +struct publication *nametbl_publish(u32 type, u32 lower, u32 upper, + u32 scope, u32 port_ref, u32 key); +int nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key); +struct publication *nametbl_insert_publ(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 ref, u32 key); +struct publication *nametbl_remove_publ(u32 type, u32 lower, + u32 node, u32 ref, u32 key); +void nametbl_subscribe(struct subscription *s); +void nametbl_unsubscribe(struct subscription *s); +int nametbl_init(void); +void nametbl_stop(void); + +#endif diff --git a/net/tipc/net.c b/net/tipc/net.c new file mode 100644 index 000000000000..6826b493c1d6 --- /dev/null +++ b/net/tipc/net.c @@ -0,0 +1,311 @@ +/* + * net/tipc/net.c: TIPC network routing code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "bearer.h" +#include "net.h" +#include "zone.h" +#include "addr.h" +#include "name_table.h" +#include "name_distr.h" +#include "subscr.h" +#include "link.h" +#include "msg.h" +#include "port.h" +#include "bcast.h" +#include "discover.h" +#include "config.h" + +/* + * The TIPC locking policy is designed to ensure a very fine locking + * granularity, permitting complete parallel access to individual + * port and node/link instances. The code consists of three major + * locking domains, each protected with their own disjunct set of locks. 
+ * + * 1: The routing hierarchy. + * Comprises the structures 'zone', 'cluster', 'node', 'link' + * and 'bearer'. The whole hierarchy is protected by a big + * read/write lock, net_lock, to ensure that nothing is added + * or removed while code is accessing any of these structures. + * This layer must not be called from the two others while they + * hold any of their own locks. + * Neither must it itself do any upcalls to the other two before + * it has released net_lock and other protective locks. + * + * Within the net_lock domain there are two sub-domains; 'node' and + * 'bearer', where local write operations are permitted, + * provided that those are protected by individual spin_locks + * per instance. Code holding net_lock(read) and a node spin_lock + * is permitted to poke around in both the node itself and its + * subordinate links. I.e., it can update link counters and queues, + * change link state, send protocol messages, and alter the + * "active_links" array in the node; but it can _not_ remove a link + * or a node from the overall structure. + * Correspondingly, individual bearers may change status within a + * net_lock(read), protected by an individual spin_lock per bearer + * instance, but it needs net_lock(write) to remove/add any bearers. + * + * + * 2: The transport level of the protocol. + * This consists of the structures port, (and its user level + * representations, such as user_port and tipc_sock), reference and + * tipc_user (port.c, reg.c, socket.c). + * + * This layer has four different locks: + * - The tipc_port spin_lock. This is protecting each port instance + * from parallel data access and removal. Since we can not place + * this lock in the port itself, it has been placed in the + * corresponding reference table entry, which has the same life + * cycle as the module.
This entry is difficult to access from + * outside the TIPC core, however, so a pointer to the lock has + * been added in the port instance, to be used for unlocking + * only. + * - A read/write lock to protect the reference table itself (ref.c). + * (Nobody is using read-only access to this, so it can just as + * well be changed to a spin_lock) + * - A spin lock to protect the registry of kernel/driver users (reg.c) + * - A global spin_lock (port_lock), whose only task is to ensure + * consistency where more than one port is involved in an operation, + * i.e., when a port is part of a linked list of ports. + * There are two such lists; 'port_list', which is used for management, + * and 'wait_list', which is used to queue ports during congestion. + * + * 3: The name table (name_table.c, name_distr.c, subscription.c) + * - There is one big read/write-lock (nametbl_lock) protecting the + * overall name table structure. Nothing must be added/removed to + * this structure without holding write access to it. + * - There is one local spin_lock per sub_sequence, which can be seen + * as a sub-domain to the nametbl_lock domain. It is used only + * for translation operations, and is needed because a translation + * steps the root of the 'publication' linked list between each lookup. + * This is always used within the scope of a nametbl_lock(read). + * - A local spin_lock protecting the queue of subscriber events.
+*/ + +rwlock_t net_lock = RW_LOCK_UNLOCKED; +struct network net = { 0 }; + +struct node *net_select_remote_node(u32 addr, u32 ref) +{ + return zone_select_remote_node(net.zones[tipc_zone(addr)], addr, ref); +} + +u32 net_select_router(u32 addr, u32 ref) +{ + return zone_select_router(net.zones[tipc_zone(addr)], addr, ref); +} + + +u32 net_next_node(u32 a) +{ + if (net.zones[tipc_zone(a)]) + return zone_next_node(a); + return 0; +} + +void net_remove_as_router(u32 router) +{ + u32 z_num; + + for (z_num = 1; z_num <= tipc_max_zones; z_num++) { + if (!net.zones[z_num]) + continue; + zone_remove_as_router(net.zones[z_num], router); + } +} + +void net_send_external_routes(u32 dest) +{ + u32 z_num; + + for (z_num = 1; z_num <= tipc_max_zones; z_num++) { + if (net.zones[z_num]) + zone_send_external_routes(net.zones[z_num], dest); + } +} + +int net_init(void) +{ + u32 sz = sizeof(struct _zone *) * (tipc_max_zones + 1); + + memset(&net, 0, sizeof(net)); + net.zones = (struct _zone **)kmalloc(sz, GFP_ATOMIC); + if (!net.zones) { + return -ENOMEM; + } + memset(net.zones, 0, sz); + return TIPC_OK; +} + +void net_stop(void) +{ + u32 z_num; + + if (!net.zones) + return; + + for (z_num = 1; z_num <= tipc_max_zones; z_num++) { + zone_delete(net.zones[z_num]); + } + kfree(net.zones); + net.zones = 0; +} + +static void net_route_named_msg(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 dnode; + u32 dport; + + if (!msg_named(msg)) { + msg_dbg(msg, "net->drop_nam:"); + buf_discard(buf); + return; + } + + dnode = addr_domain(msg_lookup_scope(msg)); + dport = nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode); + dbg("net->lookup<%u,%u>-><%u,%x>\n", + msg_nametype(msg), msg_nameinst(msg), dport, dnode); + if (dport) { + msg_set_destnode(msg, dnode); + msg_set_destport(msg, dport); + net_route_msg(buf); + return; + } + msg_dbg(msg, "net->rej:NO NAME: "); + tipc_reject_msg(buf, TIPC_ERR_NO_NAME); +} + +void net_route_msg(struct sk_buff *buf) +{ + struct 
tipc_msg *msg; + u32 dnode; + + if (!buf) + return; + msg = buf_msg(buf); + + msg_incr_reroute_cnt(msg); + if (msg_reroute_cnt(msg) > 6) { + if (msg_errcode(msg)) { + msg_dbg(msg, "NET>DISC>:"); + buf_discard(buf); + } else { + msg_dbg(msg, "NET>REJ>:"); + tipc_reject_msg(buf, msg_destport(msg) ? + TIPC_ERR_NO_PORT : TIPC_ERR_NO_NAME); + } + return; + } + + msg_dbg(msg, "net->rout: "); + + /* Handle message for this node */ + dnode = msg_short(msg) ? tipc_own_addr : msg_destnode(msg); + if (in_scope(dnode, tipc_own_addr)) { + if (msg_isdata(msg)) { + if (msg_mcast(msg)) + port_recv_mcast(buf, NULL); + else if (msg_destport(msg)) + port_recv_msg(buf); + else + net_route_named_msg(buf); + return; + } + switch (msg_user(msg)) { + case ROUTE_DISTRIBUTOR: + cluster_recv_routing_table(buf); + break; + case NAME_DISTRIBUTOR: + named_recv(buf); + break; + case CONN_MANAGER: + port_recv_proto_msg(buf); + break; + default: + msg_dbg(msg,"DROP/NET/<REC<"); + buf_discard(buf); + } + return; + } + + /* Handle message for another node */ + msg_dbg(msg, "NET>SEND>: "); + link_send(buf, dnode, msg_link_selector(msg)); +} + +int tipc_start_net(void) +{ + char addr_string[16]; + int res; + + if (tipc_mode != TIPC_NODE_MODE) + return -ENOPROTOOPT; + + tipc_mode = TIPC_NET_MODE; + named_reinit(); + port_reinit(); + + if ((res = bearer_init()) || + (res = net_init()) || + (res = cluster_init()) || + (res = bclink_init())) { + return res; + } + subscr_stop(); + cfg_stop(); + k_signal((Handler)subscr_start, 0); + k_signal((Handler)cfg_init, 0); + info("Started in network mode\n"); + info("Own node address %s, network identity %u\n", + addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); + return TIPC_OK; +} + +void tipc_stop_net(void) +{ + if (tipc_mode != TIPC_NET_MODE) + return; + write_lock_bh(&net_lock); + bearer_stop(); + tipc_mode = TIPC_NODE_MODE; + bclink_stop(); + net_stop(); + write_unlock_bh(&net_lock); + info("Left network mode \n"); +} + diff --git a/net/tipc/net.h 
b/net/tipc/net.h new file mode 100644 index 000000000000..948c6d42102c --- /dev/null +++ b/net/tipc/net.h @@ -0,0 +1,66 @@ +/* + * net/tipc/net.h: Include file for TIPC network routing code + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NET_H +#define _TIPC_NET_H + +struct _zone; + +/** + * struct network - TIPC network structure + * @zones: array of pointers to all zones within network + */ + +struct network { + struct _zone **zones; +}; + + +extern struct network net; +extern rwlock_t net_lock; + +int net_init(void); +void net_stop(void); +void net_remove_as_router(u32 router); +void net_send_external_routes(u32 dest); +void net_route_msg(struct sk_buff *buf); +struct node *net_select_remote_node(u32 addr, u32 ref); +u32 net_select_router(u32 addr, u32 ref); + +int tipc_start_net(void); +void tipc_stop_net(void); + +#endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c new file mode 100644 index 000000000000..19b3f4022532 --- /dev/null +++ b/net/tipc/netlink.c @@ -0,0 +1,112 @@ +/* + * net/tipc/netlink.c: TIPC configuration handling + * + * Copyright (c) 2005-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "config.h" +#include <net/genetlink.h> + +static int handle_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *rep_buf; + struct nlmsghdr *rep_nlh; + struct nlmsghdr *req_nlh = info->nlhdr; + struct tipc_genlmsghdr *req_userhdr = info->userhdr; + int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN); + + if ((req_userhdr->cmd & 0xC000) && (!capable(CAP_NET_ADMIN))) + rep_buf = cfg_reply_error_string(TIPC_CFG_NOT_NET_ADMIN); + else + rep_buf = cfg_do_cmd(req_userhdr->dest, + req_userhdr->cmd, + NLMSG_DATA(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN, + NLMSG_PAYLOAD(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN), + hdr_space); + + if (rep_buf) { + skb_push(rep_buf, hdr_space); + rep_nlh = (struct nlmsghdr *)rep_buf->data; + memcpy(rep_nlh, req_nlh, hdr_space); + rep_nlh->nlmsg_len = rep_buf->len; + genlmsg_unicast(rep_buf, req_nlh->nlmsg_pid); + } + + return 0; +} + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TIPC_GENL_NAME, + .version = TIPC_GENL_VERSION, + .hdrsize = TIPC_GENL_HDRLEN, + .maxattr = 0, +}; + +static struct genl_ops ops = { + .cmd = TIPC_GENL_CMD, + .doit = handle_cmd, +}; + +static int family_registered = 0; + +int netlink_start(void) +{ + + + if (genl_register_family(&family)) + goto err; + + family_registered = 1; + + if (genl_register_ops(&family, &ops)) + goto err_unregister; + + return 0; + + err_unregister: + genl_unregister_family(&family); + family_registered = 0; + err: + err("Failed to register netlink interface\n"); + return -EFAULT; +} + +void netlink_stop(void) +{ + if (family_registered) { + genl_unregister_family(&family); + family_registered = 0; + } +} diff --git a/net/tipc/node.c b/net/tipc/node.c new file mode 100644 index 000000000000..05688d01138b --- /dev/null +++ b/net/tipc/node.c @@ -0,0 +1,679 @@ +/* + * net/tipc/node.c: TIPC node management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All 
rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "core.h" +#include "config.h" +#include "node.h" +#include "cluster.h" +#include "net.h" +#include "addr.h" +#include "node_subscr.h" +#include "link.h" +#include "port.h" +#include "bearer.h" +#include "name_distr.h" +#include "net.h" + +void node_print(struct print_buf *buf, struct node *n_ptr, char *str); +static void node_lost_contact(struct node *n_ptr); +static void node_established_contact(struct node *n_ptr); + +struct node *nodes = NULL; /* sorted list of nodes within cluster */ + +u32 tipc_own_tag = 0; + +struct node *node_create(u32 addr) +{ + struct cluster *c_ptr; + struct node *n_ptr; + struct node **curr_node; + + n_ptr = kmalloc(sizeof(*n_ptr),GFP_ATOMIC); + if (n_ptr != NULL) { + memset(n_ptr, 0, sizeof(*n_ptr)); + n_ptr->addr = addr; + n_ptr->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&n_ptr->nsub); + + c_ptr = cluster_find(addr); + if (c_ptr == NULL) + c_ptr = cluster_create(addr); + if (c_ptr != NULL) { + n_ptr->owner = c_ptr; + cluster_attach_node(c_ptr, n_ptr); + n_ptr->last_router = -1; + + /* Insert node into ordered list */ + for (curr_node = &nodes; *curr_node; + curr_node = &(*curr_node)->next) { + if (addr < (*curr_node)->addr) { + n_ptr->next = *curr_node; + break; + } + } + (*curr_node) = n_ptr; + } else { + kfree(n_ptr); + n_ptr = NULL; + } + } + return n_ptr; +} + +void node_delete(struct node *n_ptr) +{ + if (!n_ptr) + return; + +#if 0 + /* Not needed because links are already deleted via bearer_stop() */ + + u32 l_num; + + for (l_num = 0; l_num < MAX_BEARERS; l_num++) { + link_delete(n_ptr->links[l_num]); + } +#endif + + dbg("node %x deleted\n", n_ptr->addr); + kfree(n_ptr); +} + + +/** + * node_link_up - handle addition of link + * + * Link becomes active (alone or shared) or standby, depending on its priority. 
+ */ + +void node_link_up(struct node *n_ptr, struct link *l_ptr) +{ + struct link **active = &n_ptr->active_links[0]; + + info("Established link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + + if (!active[0]) { + dbg(" link %x into %x/%x\n", l_ptr, &active[0], &active[1]); + active[0] = active[1] = l_ptr; + node_established_contact(n_ptr); + return; + } + if (l_ptr->priority < active[0]->priority) { + info("Link is standby\n"); + return; + } + link_send_duplicate(active[0], l_ptr); + if (l_ptr->priority == active[0]->priority) { + active[0] = l_ptr; + return; + } + info("Link <%s> on network plane %c becomes standby\n", + active[0]->name, active[0]->b_ptr->net_plane); + active[0] = active[1] = l_ptr; +} + +/** + * node_select_active_links - select active link + */ + +static void node_select_active_links(struct node *n_ptr) +{ + struct link **active = &n_ptr->active_links[0]; + u32 i; + u32 highest_prio = 0; + + active[0] = active[1] = 0; + + for (i = 0; i < MAX_BEARERS; i++) { + struct link *l_ptr = n_ptr->links[i]; + + if (!l_ptr || !link_is_up(l_ptr) || + (l_ptr->priority < highest_prio)) + continue; + + if (l_ptr->priority > highest_prio) { + highest_prio = l_ptr->priority; + active[0] = active[1] = l_ptr; + } else { + active[1] = l_ptr; + } + } +} + +/** + * node_link_down - handle loss of link + */ + +void node_link_down(struct node *n_ptr, struct link *l_ptr) +{ + struct link **active; + + if (!link_is_active(l_ptr)) { + info("Lost standby link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + return; + } + info("Lost link <%s> on network plane %c\n", + l_ptr->name, l_ptr->b_ptr->net_plane); + + active = &n_ptr->active_links[0]; + if (active[0] == l_ptr) + active[0] = active[1]; + if (active[1] == l_ptr) + active[1] = active[0]; + if (active[0] == l_ptr) + node_select_active_links(n_ptr); + if (node_is_up(n_ptr)) + link_changeover(l_ptr); + else + node_lost_contact(n_ptr); +} + +int node_has_active_links(struct 
node *n_ptr) +{ + return (n_ptr && + ((n_ptr->active_links[0]) || (n_ptr->active_links[1]))); +} + +int node_has_redundant_links(struct node *n_ptr) +{ + return (node_has_active_links(n_ptr) && + (n_ptr->active_links[0] != n_ptr->active_links[1])); +} + +int node_has_active_routes(struct node *n_ptr) +{ + return (n_ptr && (n_ptr->last_router >= 0)); +} + +int node_is_up(struct node *n_ptr) +{ + return (node_has_active_links(n_ptr) || node_has_active_routes(n_ptr)); +} + +struct node *node_attach_link(struct link *l_ptr) +{ + struct node *n_ptr = node_find(l_ptr->addr); + + if (!n_ptr) + n_ptr = node_create(l_ptr->addr); + if (n_ptr) { + u32 bearer_id = l_ptr->b_ptr->identity; + char addr_string[16]; + + assert(bearer_id < MAX_BEARERS); + if (n_ptr->link_cnt >= 2) { + char addr_string[16]; + + err("Attempt to create third link to %s\n", + addr_string_fill(addr_string, n_ptr->addr)); + return 0; + } + + if (!n_ptr->links[bearer_id]) { + n_ptr->links[bearer_id] = l_ptr; + net.zones[tipc_zone(l_ptr->addr)]->links++; + n_ptr->link_cnt++; + return n_ptr; + } + err("Attempt to establish second link on <%s> to <%s> \n", + l_ptr->b_ptr->publ.name, + addr_string_fill(addr_string, l_ptr->addr)); + } + return 0; +} + +void node_detach_link(struct node *n_ptr, struct link *l_ptr) +{ + n_ptr->links[l_ptr->b_ptr->identity] = 0; + net.zones[tipc_zone(l_ptr->addr)]->links--; + n_ptr->link_cnt--; +} + +/* + * Routing table management - five cases to handle: + * + * 1: A link towards a zone/cluster external node comes up. + * => Send a multicast message updating routing tables of all + * system nodes within own cluster that the new destination + * can be reached via this node. + * (node.establishedContact()=>cluster.multicastNewRoute()) + * + * 2: A link towards a slave node comes up. + * => Send a multicast message updating routing tables of all + * system nodes within own cluster that the new destination + * can be reached via this node. 
+ * (node.establishedContact()=>cluster.multicastNewRoute()) + * => Send a message to the slave node about existence + * of all system nodes within cluster: + * (node.establishedContact()=>cluster.sendLocalRoutes()) + * + * 3: A new cluster local system node becomes available. + * => Send message(s) to this particular node containing + * information about all cluster external and slave + * nodes which can be reached via this node. + * (node.establishedContact()==>network.sendExternalRoutes()) + * (node.establishedContact()==>network.sendSlaveRoutes()) + * => Send messages to all directly connected slave nodes + * containing information about the existence of the new node + * (node.establishedContact()=>cluster.multicastNewRoute()) + * + * 4: The link towards a zone/cluster external node or slave + * node goes down. + * => Send a multicast message updating routing tables of all + * nodes within cluster that the new destination can not any + * longer be reached via this node. + * (node.lostAllLinks()=>cluster.bcastLostRoute()) + * + * 5: A cluster local system node becomes unavailable. + * => Remove all references to this node from the local + * routing tables. Note: This is a completely node + * local operation. 
+ * (node.lostAllLinks()=>network.removeAsRouter()) + * => Send messages to all directly connected slave nodes + * containing information about loss of the node + * (node.establishedContact()=>cluster.multicastLostRoute()) + * + */ + +static void node_established_contact(struct node *n_ptr) +{ + struct cluster *c_ptr; + + dbg("node_established_contact:-> %x\n", n_ptr->addr); + if (!node_has_active_routes(n_ptr)) { + k_signal((Handler)named_node_up, n_ptr->addr); + } + + /* Synchronize broadcast acks */ + n_ptr->bclink.acked = bclink_get_last_sent(); + + if (is_slave(tipc_own_addr)) + return; + if (!in_own_cluster(n_ptr->addr)) { + /* Usage case 1 (see above) */ + c_ptr = cluster_find(tipc_own_addr); + if (!c_ptr) + c_ptr = cluster_create(tipc_own_addr); + if (c_ptr) + cluster_bcast_new_route(c_ptr, n_ptr->addr, 1, + tipc_max_nodes); + return; + } + + c_ptr = n_ptr->owner; + if (is_slave(n_ptr->addr)) { + /* Usage case 2 (see above) */ + cluster_bcast_new_route(c_ptr, n_ptr->addr, 1, tipc_max_nodes); + cluster_send_local_routes(c_ptr, n_ptr->addr); + return; + } + + if (n_ptr->bclink.supported) { + nmap_add(&cluster_bcast_nodes, n_ptr->addr); + if (n_ptr->addr < tipc_own_addr) + tipc_own_tag++; + } + + /* Case 3 (see above) */ + net_send_external_routes(n_ptr->addr); + cluster_send_slave_routes(c_ptr, n_ptr->addr); + cluster_bcast_new_route(c_ptr, n_ptr->addr, LOWEST_SLAVE, + highest_allowed_slave); +} + +static void node_lost_contact(struct node *n_ptr) +{ + struct cluster *c_ptr; + struct node_subscr *ns, *tns; + char addr_string[16]; + u32 i; + + /* Clean up broadcast reception remains */ + n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = 0; + while (n_ptr->bclink.deferred_head) { + struct sk_buff* buf = n_ptr->bclink.deferred_head; + n_ptr->bclink.deferred_head = buf->next; + buf_discard(buf); + } + if (n_ptr->bclink.defragm) { + buf_discard(n_ptr->bclink.defragm); + n_ptr->bclink.defragm = NULL; + } + if (in_own_cluster(n_ptr->addr) && n_ptr->bclink.supported) { 
+ bclink_acknowledge(n_ptr, mod(n_ptr->bclink.acked + 10000)); + } + + /* Update routing tables */ + if (is_slave(tipc_own_addr)) { + net_remove_as_router(n_ptr->addr); + } else { + if (!in_own_cluster(n_ptr->addr)) { + /* Case 4 (see above) */ + c_ptr = cluster_find(tipc_own_addr); + cluster_bcast_lost_route(c_ptr, n_ptr->addr, 1, + tipc_max_nodes); + } else { + /* Case 5 (see above) */ + c_ptr = cluster_find(n_ptr->addr); + if (is_slave(n_ptr->addr)) { + cluster_bcast_lost_route(c_ptr, n_ptr->addr, 1, + tipc_max_nodes); + } else { + if (n_ptr->bclink.supported) { + nmap_remove(&cluster_bcast_nodes, + n_ptr->addr); + if (n_ptr->addr < tipc_own_addr) + tipc_own_tag--; + } + net_remove_as_router(n_ptr->addr); + cluster_bcast_lost_route(c_ptr, n_ptr->addr, + LOWEST_SLAVE, + highest_allowed_slave); + } + } + } + if (node_has_active_routes(n_ptr)) + return; + + info("Lost contact with %s\n", + addr_string_fill(addr_string, n_ptr->addr)); + + /* Abort link changeover */ + for (i = 0; i < MAX_BEARERS; i++) { + struct link *l_ptr = n_ptr->links[i]; + if (!l_ptr) + continue; + l_ptr->reset_checkpoint = l_ptr->next_in_no; + l_ptr->exp_msg_count = 0; + link_reset_fragments(l_ptr); + } + + /* Notify subscribers */ + list_for_each_entry_safe(ns, tns, &n_ptr->nsub, nodesub_list) { + ns->node = 0; + list_del_init(&ns->nodesub_list); + k_signal((Handler)ns->handle_node_down, + (unsigned long)ns->usr_handle); + } +} + +/** + * node_select_next_hop - find the next-hop node for a message + * + * Called by when cluster local lookup has failed. 
+ */ + +struct node *node_select_next_hop(u32 addr, u32 selector) +{ + struct node *n_ptr; + u32 router_addr; + + if (!addr_domain_valid(addr)) + return 0; + + /* Look for direct link to destination processor */ + n_ptr = node_find(addr); + if (n_ptr && node_has_active_links(n_ptr)) + return n_ptr; + + /* Cluster local system nodes *must* have direct links */ + if (!is_slave(addr) && in_own_cluster(addr)) + return 0; + + /* Look for cluster local router with direct link to node */ + router_addr = node_select_router(n_ptr, selector); + if (router_addr) + return node_select(router_addr, selector); + + /* Slave nodes can only be accessed within own cluster via a + known router with direct link -- if no router was found, give up */ + if (is_slave(addr)) + return 0; + + /* Inter zone/cluster -- find any direct link to remote cluster */ + addr = tipc_addr(tipc_zone(addr), tipc_cluster(addr), 0); + n_ptr = net_select_remote_node(addr, selector); + if (n_ptr && node_has_active_links(n_ptr)) + return n_ptr; + + /* Last resort -- look for any router to anywhere in remote zone */ + router_addr = net_select_router(addr, selector); + if (router_addr) + return node_select(router_addr, selector); + + return 0; +} + +/** + * node_select_router - select router to reach specified node + * + * Uses a deterministic and fair algorithm for selecting router node. 
+ */ + +u32 node_select_router(struct node *n_ptr, u32 ref) +{ + u32 ulim; + u32 mask; + u32 start; + u32 r; + + if (!n_ptr) + return 0; + + if (n_ptr->last_router < 0) + return 0; + ulim = ((n_ptr->last_router + 1) * 32) - 1; + + /* Start entry must be random */ + mask = tipc_max_nodes; + while (mask > ulim) + mask >>= 1; + start = ref & mask; + r = start; + + /* Lookup upwards with wrap-around */ + do { + if (((n_ptr->routers[r / 32]) >> (r % 32)) & 1) + break; + } while (++r <= ulim); + if (r > ulim) { + r = 1; + do { + if (((n_ptr->routers[r / 32]) >> (r % 32)) & 1) + break; + } while (++r < start); + assert(r != start); + } + assert(r && (r <= ulim)); + return tipc_addr(own_zone(), own_cluster(), r); +} + +void node_add_router(struct node *n_ptr, u32 router) +{ + u32 r_num = tipc_node(router); + + n_ptr->routers[r_num / 32] = + ((1U << (r_num % 32)) | n_ptr->routers[r_num / 32]); /* unsigned 1 keeps the bit-31 shift defined */ + n_ptr->last_router = tipc_max_nodes / 32; + while ((--n_ptr->last_router >= 0) && + !n_ptr->routers[n_ptr->last_router]); +} + +void node_remove_router(struct node *n_ptr, u32 router) +{ + u32 r_num = tipc_node(router); + + if (n_ptr->last_router < 0) + return; /* No routes */ + + n_ptr->routers[r_num / 32] = + ((~(1U << (r_num % 32))) & (n_ptr->routers[r_num / 32])); /* unsigned 1 keeps the bit-31 shift defined */ + n_ptr->last_router = tipc_max_nodes / 32; + while ((--n_ptr->last_router >= 0) && + !n_ptr->routers[n_ptr->last_router]); + + if (!node_is_up(n_ptr)) + node_lost_contact(n_ptr); +} + +#if 0 +void node_print(struct print_buf *buf, struct node *n_ptr, char *str) +{ + u32 i; + + tipc_printf(buf, "\n\n%s", str); + for (i = 0; i < MAX_BEARERS; i++) { + if (!n_ptr->links[i]) + continue; + tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]); + } + tipc_printf(buf, "Active links: [%x,%x]\n", + n_ptr->active_links[0], n_ptr->active_links[1]); +} +#endif + +u32 tipc_available_nodes(const u32 domain) +{ + struct node *n_ptr; + u32 cnt = 0; + + for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) { + if (!in_scope(domain, 
n_ptr->addr)) + continue; + if (node_is_up(n_ptr)) + cnt++; + } + return cnt; +} + +struct sk_buff *node_get_nodes(const void *req_tlv_area, int req_tlv_space) +{ + u32 domain; + struct sk_buff *buf; + struct node *n_ptr; + struct tipc_node_info node_info; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + domain = *(u32 *)TLV_DATA(req_tlv_area); + domain = ntohl(domain); + if (!addr_domain_valid(domain)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network address)"); + + if (!nodes) + return cfg_reply_none(); + + /* For now, get space for all other nodes + (will need to modify this when slave nodes are supported */ + + buf = cfg_reply_alloc(TLV_SPACE(sizeof(node_info)) * + (tipc_max_nodes - 1)); + if (!buf) + return NULL; + + /* Add TLVs for all nodes in scope */ + + for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) { + if (!in_scope(domain, n_ptr->addr)) + continue; + node_info.addr = htonl(n_ptr->addr); + node_info.up = htonl(node_is_up(n_ptr)); + cfg_append_tlv(buf, TIPC_TLV_NODE_INFO, + &node_info, sizeof(node_info)); + } + + return buf; +} + +struct sk_buff *node_get_links(const void *req_tlv_area, int req_tlv_space) +{ + u32 domain; + struct sk_buff *buf; + struct node *n_ptr; + struct tipc_link_info link_info; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + domain = *(u32 *)TLV_DATA(req_tlv_area); + domain = ntohl(domain); + if (!addr_domain_valid(domain)) + return cfg_reply_error_string(TIPC_CFG_INVALID_VALUE + " (network address)"); + + if (!nodes) + return cfg_reply_none(); + + /* For now, get space for 2 links to all other nodes + bcast link + (will need to modify this when slave nodes are supported */ + + buf = cfg_reply_alloc(TLV_SPACE(sizeof(link_info)) * + (2 * (tipc_max_nodes - 1) + 1)); + if (!buf) + return NULL; + + /* Add TLV for broadcast link */ + + link_info.dest = tipc_own_addr & 
0xfffff00; + link_info.dest = htonl(link_info.dest); + link_info.up = htonl(1); + strcpy(link_info.str, bc_link_name); /* name is data, never a format string */ + cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); + + /* Add TLVs for any other links in scope */ + + for (n_ptr = nodes; n_ptr; n_ptr = n_ptr->next) { + u32 i; + + if (!in_scope(domain, n_ptr->addr)) + continue; + for (i = 0; i < MAX_BEARERS; i++) { + if (!n_ptr->links[i]) + continue; + link_info.dest = htonl(n_ptr->addr); + link_info.up = htonl(link_is_up(n_ptr->links[i])); + strcpy(link_info.str, n_ptr->links[i]->name); + cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, + &link_info, sizeof(link_info)); + } + } + + return buf; +} diff --git a/net/tipc/node.h b/net/tipc/node.h new file mode 100644 index 000000000000..b39442badccf --- /dev/null +++ b/net/tipc/node.h @@ -0,0 +1,144 @@ +/* + * net/tipc/node.h: Include file for TIPC node management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NODE_H +#define _TIPC_NODE_H + +#include "node_subscr.h" +#include "addr.h" +#include "cluster.h" +#include "bearer.h" + +/** + * struct node - TIPC node structure + * @addr: network address of node + * @lock: spinlock governing access to structure + * @owner: pointer to cluster that node belongs to + * @next: pointer to next node in sorted list of cluster's nodes + * @nsub: list of "node down" subscriptions monitoring node + * @active_links: pointers to active links to node + * @links: pointers to all links to node + * @link_cnt: number of links to node + * @permit_changeover: non-zero if node has redundant links to this system + * @routers: bitmap (used for multicluster communication) + * @last_router: (used for multicluster communication) + * @bclink: broadcast-related info + * @supported: non-zero if node supports TIPC b'cast capability + * @acked: sequence # of last outbound b'cast message acknowledged by node + * @last_in: sequence # of last in-sequence b'cast message received from node + * @gap_after: sequence # of last message not requiring a NAK request + * @gap_to: sequence # of last message requiring a NAK request + * @nack_sync: counter that 
determines when NAK requests should be sent + * @deferred_head: oldest OOS b'cast message received from node + * @deferred_tail: newest OOS b'cast message received from node + * @defragm: list of partially reassembled b'cast message fragments from node + */ + +struct node { + u32 addr; + spinlock_t lock; + struct cluster *owner; + struct node *next; + struct list_head nsub; + struct link *active_links[2]; + struct link *links[MAX_BEARERS]; + int link_cnt; + int permit_changeover; + u32 routers[512/32]; + int last_router; + struct { + int supported; + u32 acked; + u32 last_in; + u32 gap_after; + u32 gap_to; + u32 nack_sync; + struct sk_buff *deferred_head; + struct sk_buff *deferred_tail; + struct sk_buff *defragm; + } bclink; +}; + +extern struct node *nodes; +extern u32 tipc_own_tag; + +struct node *node_create(u32 addr); +void node_delete(struct node *n_ptr); +struct node *node_attach_link(struct link *l_ptr); +void node_detach_link(struct node *n_ptr, struct link *l_ptr); +void node_link_down(struct node *n_ptr, struct link *l_ptr); +void node_link_up(struct node *n_ptr, struct link *l_ptr); +int node_has_active_links(struct node *n_ptr); +int node_has_redundant_links(struct node *n_ptr); +u32 node_select_router(struct node *n_ptr, u32 ref); +struct node *node_select_next_hop(u32 addr, u32 selector); +int node_is_up(struct node *n_ptr); +void node_add_router(struct node *n_ptr, u32 router); +void node_remove_router(struct node *n_ptr, u32 router); +struct sk_buff *node_get_links(const void *req_tlv_area, int req_tlv_space); +struct sk_buff *node_get_nodes(const void *req_tlv_area, int req_tlv_space); + +static inline struct node *node_find(u32 addr) +{ + if (likely(in_own_cluster(addr))) + return local_nodes[tipc_node(addr)]; + else if (addr_domain_valid(addr)) { + struct cluster *c_ptr = cluster_find(addr); + + if (c_ptr) + return c_ptr->nodes[tipc_node(addr)]; + } + return 0; +} + +static inline struct node *node_select(u32 addr, u32 selector) +{ + if 
(likely(in_own_cluster(addr))) + return local_nodes[tipc_node(addr)]; + return node_select_next_hop(addr, selector); +} + +static inline void node_lock(struct node *n_ptr) +{ + spin_lock_bh(&n_ptr->lock); +} + +static inline void node_unlock(struct node *n_ptr) +{ + spin_unlock_bh(&n_ptr->lock); +} + +#endif diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c new file mode 100644 index 000000000000..79375927916f --- /dev/null +++ b/net/tipc/node_subscr.c @@ -0,0 +1,79 @@ +/* + * net/tipc/node_subscr.c: TIPC "node down" subscription handling + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "dbg.h" +#include "node_subscr.h" +#include "node.h" +#include "addr.h" + +/** + * nodesub_subscribe - create "node down" subscription for specified node + */ + +void nodesub_subscribe(struct node_subscr *node_sub, u32 addr, + void *usr_handle, net_ev_handler handle_down) +{ + node_sub->node = 0; + if (addr == tipc_own_addr) + return; + if (!addr_node_valid(addr)) { + warn("node_subscr with illegal %x\n", addr); + return; + } + + node_sub->handle_node_down = handle_down; + node_sub->usr_handle = usr_handle; + node_sub->node = node_find(addr); + assert(node_sub->node); + node_lock(node_sub->node); + list_add_tail(&node_sub->nodesub_list, &node_sub->node->nsub); + node_unlock(node_sub->node); +} + +/** + * nodesub_unsubscribe - cancel "node down" subscription (if any) + */ + +void nodesub_unsubscribe(struct node_subscr *node_sub) +{ + if (!node_sub->node) + return; + + node_lock(node_sub->node); + list_del_init(&node_sub->nodesub_list); + node_unlock(node_sub->node); +} diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h new file mode 100644 index 000000000000..a3b87ac4859b --- /dev/null +++ b/net/tipc/node_subscr.h @@ -0,0 +1,63 @@ +/* + * net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling + * + * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_NODE_SUBSCR_H +#define _TIPC_NODE_SUBSCR_H + +#include "addr.h" + +typedef void (*net_ev_handler) (void *usr_handle); + +/** + * struct node_subscr - "node down" subscription entry + * @node: ptr to node structure of interest (or NULL, if none) + * @handle_node_down: routine to invoke when node fails + * @usr_handle: argument to pass to routine when node fails + * @nodesub_list: adjacent entries in list of subscriptions for the node + */ + +struct node_subscr { + struct node *node; + net_ev_handler handle_node_down; + void *usr_handle; + struct list_head nodesub_list; +}; + +void nodesub_subscribe(struct node_subscr *node_sub, u32 addr, + void *usr_handle, net_ev_handler handle_down); +void nodesub_unsubscribe(struct node_subscr *node_sub); + +#endif diff --git a/net/tipc/port.c b/net/tipc/port.c new file mode 100644 index 000000000000..66caca7abe92 --- /dev/null +++ b/net/tipc/port.c @@ -0,0 +1,1708 @@ +/* + * net/tipc/port.c: TIPC port code + * + * Copyright (c) 1992-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "config.h" +#include "dbg.h" +#include "port.h" +#include "addr.h" +#include "link.h" +#include "node.h" +#include "port.h" +#include "name_table.h" +#include "user_reg.h" +#include "msg.h" +#include "bcast.h" + +/* Connection management: */ +#define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ +#define CONFIRMED 0 +#define PROBING 1 + +#define MAX_REJECT_SIZE 1024 + +static struct sk_buff *msg_queue_head = 0; +static struct sk_buff *msg_queue_tail = 0; + +spinlock_t port_list_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED; + +LIST_HEAD(ports); +static void port_handle_node_down(unsigned long ref); +static struct sk_buff* port_build_self_abort_msg(struct port *,u32 err); +static struct sk_buff* port_build_peer_abort_msg(struct port *,u32 err); +static void port_timeout(unsigned long ref); + + +static inline u32 port_peernode(struct port *p_ptr) +{ + return msg_destnode(&p_ptr->publ.phdr); +} + +static inline u32 port_peerport(struct port *p_ptr) +{ + return msg_destport(&p_ptr->publ.phdr); +} + +static inline u32 port_out_seqno(struct port *p_ptr) +{ + return msg_transp_seqno(&p_ptr->publ.phdr); +} + +static 
inline void port_set_out_seqno(struct port *p_ptr, u32 seqno) +{ + msg_set_transp_seqno(&p_ptr->publ.phdr,seqno); +} + +static inline void port_incr_out_seqno(struct port *p_ptr) +{ + struct tipc_msg *m = &p_ptr->publ.phdr; + + if (likely(!msg_routed(m))) + return; + msg_set_transp_seqno(m, (msg_transp_seqno(m) + 1)); +} + +/** + * tipc_multicast - send a multicast message to local and remote destinations + */ + +int tipc_multicast(u32 ref, struct tipc_name_seq const *seq, u32 domain, + u32 num_sect, struct iovec const *msg_sect) +{ + struct tipc_msg *hdr; + struct sk_buff *buf; + struct sk_buff *ibuf = NULL; + struct port_list dports = {0, NULL, }; + struct port *oport = port_deref(ref); + int ext_targets; + int res; + + if (unlikely(!oport)) + return -EINVAL; + + /* Create multicast message */ + + hdr = &oport->publ.phdr; + msg_set_type(hdr, TIPC_MCAST_MSG); + msg_set_nametype(hdr, seq->type); + msg_set_namelower(hdr, seq->lower); + msg_set_nameupper(hdr, seq->upper); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + res = msg_build(hdr, msg_sect, num_sect, MAX_MSG_SIZE, + !oport->user_port, &buf); + if (unlikely(!buf)) + return res; + + /* Figure out where to send multicast message */ + + ext_targets = nametbl_mc_translate(seq->type, seq->lower, seq->upper, + TIPC_NODE_SCOPE, &dports); + + /* Send message to destinations (duplicate it only if necessary) */ + + if (ext_targets) { + if (dports.count != 0) { + ibuf = skb_copy(buf, GFP_ATOMIC); + if (ibuf == NULL) { + port_list_free(&dports); + buf_discard(buf); + return -ENOMEM; + } + } + res = bclink_send_msg(buf); + if ((res < 0) && (dports.count != 0)) { + buf_discard(ibuf); + } + } else { + ibuf = buf; + } + + if (res >= 0) { + if (ibuf) + port_recv_mcast(ibuf, &dports); + } else { + port_list_free(&dports); + } + return res; +} + +/** + * port_recv_mcast - deliver multicast message to all destination ports + * + * If there is no port list, perform a lookup to create one + */ + +void port_recv_mcast(struct sk_buff *buf, 
struct port_list *dp) +{ + struct tipc_msg* msg; + struct port_list dports = {0, NULL, }; + struct port_list *item = dp; + int cnt = 0; + + assert(buf); + msg = buf_msg(buf); + + /* Create destination port list, if one wasn't supplied */ + + if (dp == NULL) { + nametbl_mc_translate(msg_nametype(msg), + msg_namelower(msg), + msg_nameupper(msg), + TIPC_CLUSTER_SCOPE, + &dports); + item = dp = &dports; + } + + /* Deliver a copy of message to each destination port */ + + if (dp->count != 0) { + if (dp->count == 1) { + msg_set_destport(msg, dp->ports[0]); + port_recv_msg(buf); + port_list_free(dp); + return; + } + for (; cnt < dp->count; cnt++) { + int index = cnt % PLSIZE; + struct sk_buff *b = skb_clone(buf, GFP_ATOMIC); + + if (b == NULL) { + warn("Buffer allocation failure\n"); + msg_dbg(msg, "LOST:"); + goto exit; + } + if ((index == 0) && (cnt != 0)) { + item = item->next; + } + msg_set_destport(buf_msg(b),item->ports[index]); + port_recv_msg(b); + } + } +exit: + buf_discard(buf); + port_list_free(dp); +} + +/** + * tipc_createport_raw - create a native TIPC port + * + * Returns local port reference + */ + +u32 tipc_createport_raw(void *usr_handle, + u32 (*dispatcher)(struct tipc_port *, struct sk_buff *), + void (*wakeup)(struct tipc_port *), + const u32 importance) +{ + struct port *p_ptr; + struct tipc_msg *msg; + u32 ref; + + p_ptr = kmalloc(sizeof(*p_ptr), GFP_ATOMIC); + if (p_ptr == NULL) { + warn("Memory squeeze; failed to create port\n"); + return 0; + } + memset(p_ptr, 0, sizeof(*p_ptr)); + ref = ref_acquire(p_ptr, &p_ptr->publ.lock); + if (!ref) { + warn("Reference Table Exhausted\n"); + kfree(p_ptr); + return 0; + } + + port_lock(ref); + p_ptr->publ.ref = ref; + msg = &p_ptr->publ.phdr; + msg_init(msg, DATA_LOW, TIPC_NAMED_MSG, TIPC_OK, LONG_H_SIZE, 0); + msg_set_orignode(msg, tipc_own_addr); + msg_set_prevnode(msg, tipc_own_addr); + msg_set_origport(msg, ref); + msg_set_importance(msg,importance); + p_ptr->last_in_seqno = 41; + p_ptr->sent = 1; + 
p_ptr->publ.usr_handle = usr_handle; + INIT_LIST_HEAD(&p_ptr->wait_list); + INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); + p_ptr->congested_link = 0; + p_ptr->max_pkt = MAX_PKT_DEFAULT; + p_ptr->dispatcher = dispatcher; + p_ptr->wakeup = wakeup; + p_ptr->user_port = 0; + k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); + spin_lock_bh(&port_list_lock); + INIT_LIST_HEAD(&p_ptr->publications); + INIT_LIST_HEAD(&p_ptr->port_list); + list_add_tail(&p_ptr->port_list, &ports); + spin_unlock_bh(&port_list_lock); + port_unlock(p_ptr); + return ref; +} + +int tipc_deleteport(u32 ref) +{ + struct port *p_ptr; + struct sk_buff *buf = 0; + + tipc_withdraw(ref, 0, 0); + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + + ref_discard(ref); + port_unlock(p_ptr); + + k_cancel_timer(&p_ptr->timer); + if (p_ptr->publ.connected) { + buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); + nodesub_unsubscribe(&p_ptr->subscription); + } + if (p_ptr->user_port) { + reg_remove_port(p_ptr->user_port); + kfree(p_ptr->user_port); + } + + spin_lock_bh(&port_list_lock); + list_del(&p_ptr->port_list); + list_del(&p_ptr->wait_list); + spin_unlock_bh(&port_list_lock); + k_term_timer(&p_ptr->timer); + kfree(p_ptr); + dbg("Deleted port %u\n", ref); + net_route_msg(buf); + return TIPC_OK; +} + +/** + * tipc_get_port() - return port associated with 'ref' + * + * Note: Port is not locked. 
+ */ + +struct tipc_port *tipc_get_port(const u32 ref) +{ + return (struct tipc_port *)ref_deref(ref); +} + +/** + * tipc_get_handle - return user handle associated to port 'ref' + */ + +void *tipc_get_handle(const u32 ref) +{ + struct port *p_ptr; + void * handle; + + p_ptr = port_lock(ref); + if (!p_ptr) + return 0; + handle = p_ptr->publ.usr_handle; + port_unlock(p_ptr); + return handle; +} + +static inline int port_unreliable(struct port *p_ptr) +{ + return msg_src_droppable(&p_ptr->publ.phdr); +} + +int tipc_portunreliable(u32 ref, unsigned int *isunreliable) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + *isunreliable = port_unreliable(p_ptr); + spin_unlock_bh(p_ptr->publ.lock); + return TIPC_OK; +} + +int tipc_set_portunreliable(u32 ref, unsigned int isunreliable) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_src_droppable(&p_ptr->publ.phdr, (isunreliable != 0)); + port_unlock(p_ptr); + return TIPC_OK; +} + +static inline int port_unreturnable(struct port *p_ptr) +{ + return msg_dest_droppable(&p_ptr->publ.phdr); +} + +int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + *isunrejectable = port_unreturnable(p_ptr); + spin_unlock_bh(p_ptr->publ.lock); + return TIPC_OK; +} + +int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_dest_droppable(&p_ptr->publ.phdr, (isunrejectable != 0)); + port_unlock(p_ptr); + return TIPC_OK; +} + +/* + * port_build_proto_msg(): build a port level protocol + * or a connection abortion message. Called with + * tipc_port lock on. 
+ */ +static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode, + u32 origport, u32 orignode, + u32 usr, u32 type, u32 err, + u32 seqno, u32 ack) +{ + struct sk_buff *buf; + struct tipc_msg *msg; + + buf = buf_acquire(LONG_H_SIZE); + if (buf) { + msg = buf_msg(buf); + msg_init(msg, usr, type, err, LONG_H_SIZE, destnode); + msg_set_destport(msg, destport); + msg_set_origport(msg, origport); + msg_set_destnode(msg, destnode); + msg_set_orignode(msg, orignode); + msg_set_transp_seqno(msg, seqno); + msg_set_msgcnt(msg, ack); + msg_dbg(msg, "PORT>SEND>:"); + } + return buf; +} + +int tipc_set_msg_option(struct tipc_port *tp_ptr, const char *opt, const u32 sz) +{ + msg_expand(&tp_ptr->phdr, msg_destnode(&tp_ptr->phdr)); + msg_set_options(&tp_ptr->phdr, opt, sz); + return TIPC_OK; +} + +int tipc_reject_msg(struct sk_buff *buf, u32 err) +{ + struct tipc_msg *msg = buf_msg(buf); + struct sk_buff *rbuf; + struct tipc_msg *rmsg; + int hdr_sz; + u32 imp = msg_importance(msg); + u32 data_sz = msg_data_sz(msg); + + if (data_sz > MAX_REJECT_SIZE) + data_sz = MAX_REJECT_SIZE; + if (msg_connected(msg) && (imp < TIPC_CRITICAL_IMPORTANCE)) + imp++; + msg_dbg(msg, "port->rej: "); + + /* discard rejected message if it shouldn't be returned to sender */ + if (msg_errcode(msg) || msg_dest_droppable(msg)) { + buf_discard(buf); + return data_sz; + } + + /* construct rejected message */ + if (msg_mcast(msg)) + hdr_sz = MCAST_H_SIZE; + else + hdr_sz = LONG_H_SIZE; + rbuf = buf_acquire(data_sz + hdr_sz); + if (rbuf == NULL) { + buf_discard(buf); + return data_sz; + } + rmsg = buf_msg(rbuf); + msg_init(rmsg, imp, msg_type(msg), err, hdr_sz, msg_orignode(msg)); + msg_set_destport(rmsg, msg_origport(msg)); + msg_set_prevnode(rmsg, tipc_own_addr); + msg_set_origport(rmsg, msg_destport(msg)); + if (msg_short(msg)) + msg_set_orignode(rmsg, tipc_own_addr); + else + msg_set_orignode(rmsg, msg_destnode(msg)); + msg_set_size(rmsg, data_sz + hdr_sz); + msg_set_nametype(rmsg, 
msg_nametype(msg)); + msg_set_nameinst(rmsg, msg_nameinst(msg)); + memcpy(rbuf->data + hdr_sz, msg_data(msg), data_sz); + + /* send self-abort message when rejecting on a connected port */ + if (msg_connected(msg)) { + struct sk_buff *abuf = 0; + struct port *p_ptr = port_lock(msg_destport(msg)); + + if (p_ptr) { + if (p_ptr->publ.connected) + abuf = port_build_self_abort_msg(p_ptr, err); + port_unlock(p_ptr); + } + net_route_msg(abuf); + } + + /* send rejected message */ + buf_discard(buf); + net_route_msg(rbuf); + return data_sz; +} + +int port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr, + struct iovec const *msg_sect, u32 num_sect, + int err) +{ + struct sk_buff *buf; + int res; + + res = msg_build(hdr, msg_sect, num_sect, MAX_MSG_SIZE, + !p_ptr->user_port, &buf); + if (!buf) + return res; + + return tipc_reject_msg(buf, err); +} + +static void port_timeout(unsigned long ref) +{ + struct port *p_ptr = port_lock(ref); + struct sk_buff *buf = 0; + + if (!p_ptr || !p_ptr->publ.connected) + return; + + /* Last probe answered ? 
*/ + if (p_ptr->probing_state == PROBING) { + buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT); + } else { + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + p_ptr->publ.ref, + tipc_own_addr, + CONN_MANAGER, + CONN_PROBE, + TIPC_OK, + port_out_seqno(p_ptr), + 0); + port_incr_out_seqno(p_ptr); + p_ptr->probing_state = PROBING; + k_start_timer(&p_ptr->timer, p_ptr->probing_interval); + } + port_unlock(p_ptr); + net_route_msg(buf); +} + + +static void port_handle_node_down(unsigned long ref) +{ + struct port *p_ptr = port_lock(ref); + struct sk_buff* buf = 0; + + if (!p_ptr) + return; + buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE); + port_unlock(p_ptr); + net_route_msg(buf); +} + + +static struct sk_buff *port_build_self_abort_msg(struct port *p_ptr, u32 err) +{ + u32 imp = msg_importance(&p_ptr->publ.phdr); + + if (!p_ptr->publ.connected) + return 0; + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + return port_build_proto_msg(p_ptr->publ.ref, + tipc_own_addr, + port_peerport(p_ptr), + port_peernode(p_ptr), + imp, + TIPC_CONN_MSG, + err, + p_ptr->last_in_seqno + 1, + 0); +} + + +static struct sk_buff *port_build_peer_abort_msg(struct port *p_ptr, u32 err) +{ + u32 imp = msg_importance(&p_ptr->publ.phdr); + + if (!p_ptr->publ.connected) + return 0; + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + return port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + p_ptr->publ.ref, + tipc_own_addr, + imp, + TIPC_CONN_MSG, + err, + port_out_seqno(p_ptr), + 0); +} + +void port_recv_proto_msg(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct port *p_ptr = port_lock(msg_destport(msg)); + u32 err = TIPC_OK; + struct sk_buff *r_buf = 0; + struct sk_buff *abort_buf = 0; + + msg_dbg(msg, "PORT<RECV<:"); + + if (!p_ptr) { + err = TIPC_ERR_NO_PORT; + } else if (p_ptr->publ.connected) { + if (port_peernode(p_ptr) != msg_orignode(msg)) + err = TIPC_ERR_NO_PORT; + if (port_peerport(p_ptr) != 
msg_origport(msg)) + err = TIPC_ERR_NO_PORT; + if (!err && msg_routed(msg)) { + u32 seqno = msg_transp_seqno(msg); + u32 myno = ++p_ptr->last_in_seqno; + if (seqno != myno) { + err = TIPC_ERR_NO_PORT; + abort_buf = port_build_self_abort_msg(p_ptr, err); + } + } + if (msg_type(msg) == CONN_ACK) { + int wakeup = port_congested(p_ptr) && + p_ptr->publ.congested && + p_ptr->wakeup; + p_ptr->acked += msg_msgcnt(msg); + if (port_congested(p_ptr)) + goto exit; + p_ptr->publ.congested = 0; + if (!wakeup) + goto exit; + p_ptr->wakeup(&p_ptr->publ); + goto exit; + } + } else if (p_ptr->publ.published) { + err = TIPC_ERR_NO_PORT; + } + if (err) { + r_buf = port_build_proto_msg(msg_origport(msg), + msg_orignode(msg), + msg_destport(msg), + tipc_own_addr, + DATA_HIGH, + TIPC_CONN_MSG, + err, + 0, + 0); + goto exit; + } + + /* All is fine */ + if (msg_type(msg) == CONN_PROBE) { + r_buf = port_build_proto_msg(msg_origport(msg), + msg_orignode(msg), + msg_destport(msg), + tipc_own_addr, + CONN_MANAGER, + CONN_PROBE_REPLY, + TIPC_OK, + port_out_seqno(p_ptr), + 0); + } + p_ptr->probing_state = CONFIRMED; + port_incr_out_seqno(p_ptr); +exit: + if (p_ptr) + port_unlock(p_ptr); + net_route_msg(r_buf); + net_route_msg(abort_buf); + buf_discard(buf); +} + +static void port_print(struct port *p_ptr, struct print_buf *buf, int full_id) +{ + struct publication *publ; + + if (full_id) + tipc_printf(buf, "<%u.%u.%u:%u>:", + tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), p_ptr->publ.ref); + else + tipc_printf(buf, "%-10u:", p_ptr->publ.ref); + + if (p_ptr->publ.connected) { + u32 dport = port_peerport(p_ptr); + u32 destnode = port_peernode(p_ptr); + + tipc_printf(buf, " connected to <%u.%u.%u:%u>", + tipc_zone(destnode), tipc_cluster(destnode), + tipc_node(destnode), dport); + if (p_ptr->publ.conn_type != 0) + tipc_printf(buf, " via {%u,%u}", + p_ptr->publ.conn_type, + p_ptr->publ.conn_instance); + } + else if (p_ptr->publ.published) { + tipc_printf(buf, " 
bound to"); + list_for_each_entry(publ, &p_ptr->publications, pport_list) { + if (publ->lower == publ->upper) + tipc_printf(buf, " {%u,%u}", publ->type, + publ->lower); + else + tipc_printf(buf, " {%u,%u,%u}", publ->type, + publ->lower, publ->upper); + } + } + tipc_printf(buf, "\n"); +} + +#define MAX_PORT_QUERY 32768 + +struct sk_buff *port_get_ports(void) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + struct print_buf pb; + struct port *p_ptr; + int str_len; + + buf = cfg_reply_alloc(TLV_SPACE(MAX_PORT_QUERY)); + if (!buf) + return NULL; + rep_tlv = (struct tlv_desc *)buf->data; + + printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_QUERY); + spin_lock_bh(&port_list_lock); + list_for_each_entry(p_ptr, &ports, port_list) { + spin_lock_bh(p_ptr->publ.lock); + port_print(p_ptr, &pb, 0); + spin_unlock_bh(p_ptr->publ.lock); + } + spin_unlock_bh(&port_list_lock); + str_len = printbuf_validate(&pb); + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +#if 0 + +#define MAX_PORT_STATS 2000 + +struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space) +{ + u32 ref; + struct port *p_ptr; + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + struct print_buf pb; + int str_len; + + if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF)) + return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); + + ref = *(u32 *)TLV_DATA(req_tlv_area); + ref = ntohl(ref); + + p_ptr = port_lock(ref); + if (!p_ptr) + return cfg_reply_error_string("port not found"); + + buf = cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS)); + if (!buf) { + port_unlock(p_ptr); + return NULL; + } + rep_tlv = (struct tlv_desc *)buf->data; + + printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS); + port_print(p_ptr, &pb, 1); + /* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */ + port_unlock(p_ptr); + str_len = printbuf_validate(&pb); + + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + 
+ return buf; +} + +#endif + +void port_reinit(void) +{ + struct port *p_ptr; + struct tipc_msg *msg; + + spin_lock_bh(&port_list_lock); + list_for_each_entry(p_ptr, &ports, port_list) { + msg = &p_ptr->publ.phdr; + if (msg_orignode(msg) == tipc_own_addr) + break; + msg_set_orignode(msg, tipc_own_addr); + } + spin_unlock_bh(&port_list_lock); +} + + +/* + * port_dispatcher_sigh(): Signal handler for messages destinated + * to the tipc_port interface. + */ + +static void port_dispatcher_sigh(void *dummy) +{ + struct sk_buff *buf; + + spin_lock_bh(&queue_lock); + buf = msg_queue_head; + msg_queue_head = 0; + spin_unlock_bh(&queue_lock); + + while (buf) { + struct port *p_ptr; + struct user_port *up_ptr; + struct tipc_portid orig; + struct tipc_name_seq dseq; + void *usr_handle; + int connected; + int published; + + struct sk_buff *next = buf->next; + struct tipc_msg *msg = buf_msg(buf); + u32 dref = msg_destport(msg); + + p_ptr = port_lock(dref); + if (!p_ptr) { + /* Port deleted while msg in queue */ + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + buf = next; + continue; + } + orig.ref = msg_origport(msg); + orig.node = msg_orignode(msg); + up_ptr = p_ptr->user_port; + usr_handle = up_ptr->usr_handle; + connected = p_ptr->publ.connected; + published = p_ptr->publ.published; + + if (unlikely(msg_errcode(msg))) + goto err; + + switch (msg_type(msg)) { + + case TIPC_CONN_MSG:{ + tipc_conn_msg_event cb = up_ptr->conn_msg_cb; + u32 peer_port = port_peerport(p_ptr); + u32 peer_node = port_peernode(p_ptr); + + spin_unlock_bh(p_ptr->publ.lock); + if (unlikely(!connected)) { + if (unlikely(published)) + goto reject; + tipc_connect2port(dref,&orig); + } + if (unlikely(msg_origport(msg) != peer_port)) + goto reject; + if (unlikely(msg_orignode(msg) != peer_node)) + goto reject; + if (unlikely(!cb)) + goto reject; + if (unlikely(++p_ptr->publ.conn_unacked >= + TIPC_FLOW_CONTROL_WIN)) + tipc_acknowledge(dref, + p_ptr->publ.conn_unacked); + skb_pull(buf, msg_hdr_sz(msg)); + 
cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg)); + break; + } + case TIPC_DIRECT_MSG:{ + tipc_msg_event cb = up_ptr->msg_cb; + + spin_unlock_bh(p_ptr->publ.lock); + if (unlikely(connected)) + goto reject; + if (unlikely(!cb)) + goto reject; + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_importance(msg), + &orig); + break; + } + case TIPC_NAMED_MSG:{ + tipc_named_msg_event cb = up_ptr->named_msg_cb; + + spin_unlock_bh(p_ptr->publ.lock); + if (unlikely(connected)) + goto reject; + if (unlikely(!cb)) + goto reject; + if (unlikely(!published)) + goto reject; + dseq.type = msg_nametype(msg); + dseq.lower = msg_nameinst(msg); + dseq.upper = dseq.lower; + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_importance(msg), + &orig, &dseq); + break; + } + } + if (buf) + buf_discard(buf); + buf = next; + continue; +err: + switch (msg_type(msg)) { + + case TIPC_CONN_MSG:{ + tipc_conn_shutdown_event cb = + up_ptr->conn_err_cb; + u32 peer_port = port_peerport(p_ptr); + u32 peer_node = port_peernode(p_ptr); + + spin_unlock_bh(p_ptr->publ.lock); + if (!connected || !cb) + break; + if (msg_origport(msg) != peer_port) + break; + if (msg_orignode(msg) != peer_node) + break; + tipc_disconnect(dref); + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg)); + break; + } + case TIPC_DIRECT_MSG:{ + tipc_msg_err_event cb = up_ptr->err_cb; + + spin_unlock_bh(p_ptr->publ.lock); + if (connected || !cb) + break; + skb_pull(buf, msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg), &orig); + break; + } + case TIPC_NAMED_MSG:{ + tipc_named_msg_err_event cb = + up_ptr->named_err_cb; + + spin_unlock_bh(p_ptr->publ.lock); + if (connected || !cb) + break; + dseq.type = msg_nametype(msg); + dseq.lower = msg_nameinst(msg); + dseq.upper = dseq.lower; + skb_pull(buf, 
msg_hdr_sz(msg)); + cb(usr_handle, dref, &buf, msg_data(msg), + msg_data_sz(msg), msg_errcode(msg), &dseq); + break; + } + } + if (buf) + buf_discard(buf); + buf = next; + continue; +reject: + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + buf = next; + } +} + +/* + * port_dispatcher(): Dispatcher for messages destinated + * to the tipc_port interface. Called with port locked. + */ + +static u32 port_dispatcher(struct tipc_port *dummy, struct sk_buff *buf) +{ + buf->next = NULL; + spin_lock_bh(&queue_lock); + if (msg_queue_head) { + msg_queue_tail->next = buf; + msg_queue_tail = buf; + } else { + msg_queue_tail = msg_queue_head = buf; + k_signal((Handler)port_dispatcher_sigh, 0); + } + spin_unlock_bh(&queue_lock); + return TIPC_OK; +} + +/* + * Wake up port after congestion: Called with port locked, + * + */ + +static void port_wakeup_sh(unsigned long ref) +{ + struct port *p_ptr; + struct user_port *up_ptr; + tipc_continue_event cb = 0; + void *uh = 0; + + p_ptr = port_lock(ref); + if (p_ptr) { + up_ptr = p_ptr->user_port; + if (up_ptr) { + cb = up_ptr->continue_event_cb; + uh = up_ptr->usr_handle; + } + port_unlock(p_ptr); + } + if (cb) + cb(uh, ref); +} + + +static void port_wakeup(struct tipc_port *p_ptr) +{ + k_signal((Handler)port_wakeup_sh, p_ptr->ref); +} + +void tipc_acknowledge(u32 ref, u32 ack) +{ + struct port *p_ptr; + struct sk_buff *buf = 0; + + p_ptr = port_lock(ref); + if (!p_ptr) + return; + if (p_ptr->publ.connected) { + p_ptr->publ.conn_unacked -= ack; + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + ref, + tipc_own_addr, + CONN_MANAGER, + CONN_ACK, + TIPC_OK, + port_out_seqno(p_ptr), + ack); + } + port_unlock(p_ptr); + net_route_msg(buf); +} + +/* + * tipc_createport(): user level call. Will add port to + * registry if non-zero user_ref. 
+ */ + +int tipc_createport(u32 user_ref, + void *usr_handle, + unsigned int importance, + tipc_msg_err_event error_cb, + tipc_named_msg_err_event named_error_cb, + tipc_conn_shutdown_event conn_error_cb, + tipc_msg_event msg_cb, + tipc_named_msg_event named_msg_cb, + tipc_conn_msg_event conn_msg_cb, + tipc_continue_event continue_event_cb,/* May be zero */ + u32 *portref) +{ + struct user_port *up_ptr; + struct port *p_ptr; + u32 ref; + + up_ptr = (struct user_port *)kmalloc(sizeof(*up_ptr), GFP_ATOMIC); + if (up_ptr == NULL) { + return -ENOMEM; + } + ref = tipc_createport_raw(0, port_dispatcher, port_wakeup, importance); + p_ptr = port_lock(ref); + if (!p_ptr) { + kfree(up_ptr); + return -ENOMEM; + } + + p_ptr->user_port = up_ptr; + up_ptr->user_ref = user_ref; + up_ptr->usr_handle = usr_handle; + up_ptr->ref = p_ptr->publ.ref; + up_ptr->err_cb = error_cb; + up_ptr->named_err_cb = named_error_cb; + up_ptr->conn_err_cb = conn_error_cb; + up_ptr->msg_cb = msg_cb; + up_ptr->named_msg_cb = named_msg_cb; + up_ptr->conn_msg_cb = conn_msg_cb; + up_ptr->continue_event_cb = continue_event_cb; + INIT_LIST_HEAD(&up_ptr->uport_list); + reg_add_port(up_ptr); + *portref = p_ptr->publ.ref; + dbg(" tipc_createport: %x with ref %u\n", p_ptr, p_ptr->publ.ref); + port_unlock(p_ptr); + return TIPC_OK; +} + +int tipc_ownidentity(u32 ref, struct tipc_portid *id) +{ + id->ref = ref; + id->node = tipc_own_addr; + return TIPC_OK; +} + +int tipc_portimportance(u32 ref, unsigned int *importance) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + *importance = (unsigned int)msg_importance(&p_ptr->publ.phdr); + spin_unlock_bh(p_ptr->publ.lock); + return TIPC_OK; +} + +int tipc_set_portimportance(u32 ref, unsigned int imp) +{ + struct port *p_ptr; + + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + msg_set_importance(&p_ptr->publ.phdr, (u32)imp); + spin_unlock_bh(p_ptr->publ.lock); + return 
TIPC_OK; +} + + +int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +{ + struct port *p_ptr; + struct publication *publ; + u32 key; + int res = -EINVAL; + + p_ptr = port_lock(ref); + dbg("tipc_publ %u, p_ptr = %x, conn = %x, scope = %x, " + "lower = %u, upper = %u\n", + ref, p_ptr, p_ptr->publ.connected, scope, seq->lower, seq->upper); + if (!p_ptr) + return -EINVAL; + if (p_ptr->publ.connected) + goto exit; + if (seq->lower > seq->upper) + goto exit; + if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE)) + goto exit; + key = ref + p_ptr->pub_count + 1; + if (key == ref) { + res = -EADDRINUSE; + goto exit; + } + publ = nametbl_publish(seq->type, seq->lower, seq->upper, + scope, p_ptr->publ.ref, key); + if (publ) { + list_add(&publ->pport_list, &p_ptr->publications); + p_ptr->pub_count++; + p_ptr->publ.published = 1; + res = TIPC_OK; + } +exit: + port_unlock(p_ptr); + return res; +} + +int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) +{ + struct port *p_ptr; + struct publication *publ; + struct publication *tpubl; + int res = -EINVAL; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + if (!p_ptr->publ.published) + goto exit; + if (!seq) { + list_for_each_entry_safe(publ, tpubl, + &p_ptr->publications, pport_list) { + nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + } + res = TIPC_OK; + } else { + list_for_each_entry_safe(publ, tpubl, + &p_ptr->publications, pport_list) { + if (publ->scope != scope) + continue; + if (publ->type != seq->type) + continue; + if (publ->lower != seq->lower) + continue; + if (publ->upper != seq->upper) + break; + nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + res = TIPC_OK; + break; + } + } + if (list_empty(&p_ptr->publications)) + p_ptr->publ.published = 0; +exit: + port_unlock(p_ptr); + return res; +} + +int tipc_connect2port(u32 ref, struct tipc_portid const *peer) +{ + struct port *p_ptr; + struct tipc_msg *msg; + int 
res = -EINVAL; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + if (p_ptr->publ.published || p_ptr->publ.connected) + goto exit; + if (!peer->ref) + goto exit; + + msg = &p_ptr->publ.phdr; + msg_set_destnode(msg, peer->node); + msg_set_destport(msg, peer->ref); + msg_set_orignode(msg, tipc_own_addr); + msg_set_origport(msg, p_ptr->publ.ref); + msg_set_transp_seqno(msg, 42); + msg_set_type(msg, TIPC_CONN_MSG); + if (!may_route(peer->node)) + msg_set_hdr_sz(msg, SHORT_H_SIZE); + else + msg_set_hdr_sz(msg, LONG_H_SIZE); + + p_ptr->probing_interval = PROBING_INTERVAL; + p_ptr->probing_state = CONFIRMED; + p_ptr->publ.connected = 1; + k_start_timer(&p_ptr->timer, p_ptr->probing_interval); + + nodesub_subscribe(&p_ptr->subscription,peer->node, + (void *)(unsigned long)ref, + (net_ev_handler)port_handle_node_down); + res = TIPC_OK; +exit: + port_unlock(p_ptr); + p_ptr->max_pkt = link_get_max_pkt(peer->node, ref); + return res; +} + +/* + * tipc_disconnect(): Disconnect port form peer. + * This is a node local operation. + */ + +int tipc_disconnect(u32 ref) +{ + struct port *p_ptr; + int res = -ENOTCONN; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + if (p_ptr->publ.connected) { + p_ptr->publ.connected = 0; + /* let timer expire on it's own to avoid deadlock! 
*/ + nodesub_unsubscribe(&p_ptr->subscription); + res = TIPC_OK; + } + port_unlock(p_ptr); + return res; +} + +/* + * tipc_shutdown(): Send a SHUTDOWN msg to peer and disconnect + */ +int tipc_shutdown(u32 ref) +{ + struct port *p_ptr; + struct sk_buff *buf = 0; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + + if (p_ptr->publ.connected) { + u32 imp = msg_importance(&p_ptr->publ.phdr); + if (imp < TIPC_CRITICAL_IMPORTANCE) + imp++; + buf = port_build_proto_msg(port_peerport(p_ptr), + port_peernode(p_ptr), + ref, + tipc_own_addr, + imp, + TIPC_CONN_MSG, + TIPC_CONN_SHUTDOWN, + port_out_seqno(p_ptr), + 0); + } + port_unlock(p_ptr); + net_route_msg(buf); + return tipc_disconnect(ref); +} + +int tipc_isconnected(u32 ref, int *isconnected) +{ + struct port *p_ptr; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + *isconnected = p_ptr->publ.connected; + port_unlock(p_ptr); + return TIPC_OK; +} + +int tipc_peer(u32 ref, struct tipc_portid *peer) +{ + struct port *p_ptr; + int res; + + p_ptr = port_lock(ref); + if (!p_ptr) + return -EINVAL; + if (p_ptr->publ.connected) { + peer->ref = port_peerport(p_ptr); + peer->node = port_peernode(p_ptr); + res = TIPC_OK; + } else + res = -ENOTCONN; + port_unlock(p_ptr); + return res; +} + +int tipc_ref_valid(u32 ref) +{ + /* Works irrespective of type */ + return !!ref_deref(ref); +} + + +/* + * port_recv_sections(): Concatenate and deliver sectioned + * message for this node. 
+ */ + +int port_recv_sections(struct port *sender, unsigned int num_sect, + struct iovec const *msg_sect) +{ + struct sk_buff *buf; + int res; + + res = msg_build(&sender->publ.phdr, msg_sect, num_sect, + MAX_MSG_SIZE, !sender->user_port, &buf); + if (likely(buf)) + port_recv_msg(buf); + return res; +} + +/** + * tipc_send - send message sections on connection + */ + +int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect) +{ + struct port *p_ptr; + u32 destnode; + int res; + + p_ptr = port_deref(ref); + if (!p_ptr || !p_ptr->publ.connected) + return -EINVAL; + + p_ptr->publ.congested = 1; + if (!port_congested(p_ptr)) { + destnode = port_peernode(p_ptr); + if (likely(destnode != tipc_own_addr)) + res = link_send_sections_fast(p_ptr, msg_sect, num_sect, + destnode); + else + res = port_recv_sections(p_ptr, num_sect, msg_sect); + + if (likely(res != -ELINKCONG)) { + port_incr_out_seqno(p_ptr); + p_ptr->publ.congested = 0; + p_ptr->sent++; + return res; + } + } + if (port_unreliable(p_ptr)) { + p_ptr->publ.congested = 0; + /* Just calculate msg length and return */ + return msg_calc_data_size(msg_sect, num_sect); + } + return -ELINKCONG; +} + +/** + * tipc_send_buf - send message buffer on connection + */ + +int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz) +{ + struct port *p_ptr; + struct tipc_msg *msg; + u32 destnode; + u32 hsz; + u32 sz; + u32 res; + + p_ptr = port_deref(ref); + if (!p_ptr || !p_ptr->publ.connected) + return -EINVAL; + + msg = &p_ptr->publ.phdr; + hsz = msg_hdr_sz(msg); + sz = hsz + dsz; + msg_set_size(msg, sz); + if (skb_cow(buf, hsz)) + return -ENOMEM; + + skb_push(buf, hsz); + memcpy(buf->data, (unchar *)msg, hsz); + destnode = msg_destnode(msg); + p_ptr->publ.congested = 1; + if (!port_congested(p_ptr)) { + if (likely(destnode != tipc_own_addr)) + res = tipc_send_buf_fast(buf, destnode); + else { + port_recv_msg(buf); + res = sz; + } + if (likely(res != -ELINKCONG)) { + port_incr_out_seqno(p_ptr); + 
p_ptr->sent++; + p_ptr->publ.congested = 0; + return res; + } + } + if (port_unreliable(p_ptr)) { + p_ptr->publ.congested = 0; + return dsz; + } + return -ELINKCONG; +} + +/** + * tipc_forward2name - forward message sections to port name + */ + +int tipc_forward2name(u32 ref, + struct tipc_name const *name, + u32 domain, + u32 num_sect, + struct iovec const *msg_sect, + struct tipc_portid const *orig, + unsigned int importance) +{ + struct port *p_ptr; + struct tipc_msg *msg; + u32 destnode = domain; + u32 destport = 0; + int res; + + p_ptr = port_deref(ref); + if (!p_ptr || p_ptr->publ.connected) + return -EINVAL; + + msg = &p_ptr->publ.phdr; + msg_set_type(msg, TIPC_NAMED_MSG); + msg_set_orignode(msg, orig->node); + msg_set_origport(msg, orig->ref); + msg_set_hdr_sz(msg, LONG_H_SIZE); + msg_set_nametype(msg, name->type); + msg_set_nameinst(msg, name->instance); + msg_set_lookup_scope(msg, addr_scope(domain)); + if (importance <= TIPC_CRITICAL_IMPORTANCE) + msg_set_importance(msg,importance); + destport = nametbl_translate(name->type, name->instance, &destnode); + msg_set_destnode(msg, destnode); + msg_set_destport(msg, destport); + + if (likely(destport || destnode)) { + p_ptr->sent++; + if (likely(destnode == tipc_own_addr)) + return port_recv_sections(p_ptr, num_sect, msg_sect); + res = link_send_sections_fast(p_ptr, msg_sect, num_sect, + destnode); + if (likely(res != -ELINKCONG)) + return res; + if (port_unreliable(p_ptr)) { + /* Just calculate msg length and return */ + return msg_calc_data_size(msg_sect, num_sect); + } + return -ELINKCONG; + } + return port_reject_sections(p_ptr, msg, msg_sect, num_sect, + TIPC_ERR_NO_NAME); +} + +/** + * tipc_send2name - send message sections to port name + */ + +int tipc_send2name(u32 ref, + struct tipc_name const *name, + unsigned int domain, + unsigned int num_sect, + struct iovec const *msg_sect) +{ + struct tipc_portid orig; + + orig.ref = ref; + orig.node = tipc_own_addr; + return tipc_forward2name(ref, name, domain, 
num_sect, msg_sect, &orig, + TIPC_PORT_IMPORTANCE); +} + +/** + * tipc_forward_buf2name - forward message buffer to port name + */ + +int tipc_forward_buf2name(u32 ref, + struct tipc_name const *name, + u32 domain, + struct sk_buff *buf, + unsigned int dsz, + struct tipc_portid const *orig, + unsigned int importance) +{ + struct port *p_ptr; + struct tipc_msg *msg; + u32 destnode = domain; + u32 destport = 0; + int res; + + p_ptr = (struct port *)ref_deref(ref); + if (!p_ptr || p_ptr->publ.connected) + return -EINVAL; + + msg = &p_ptr->publ.phdr; + if (importance <= TIPC_CRITICAL_IMPORTANCE) + msg_set_importance(msg, importance); + msg_set_type(msg, TIPC_NAMED_MSG); + msg_set_orignode(msg, orig->node); + msg_set_origport(msg, orig->ref); + msg_set_nametype(msg, name->type); + msg_set_nameinst(msg, name->instance); + msg_set_lookup_scope(msg, addr_scope(domain)); + msg_set_hdr_sz(msg, LONG_H_SIZE); + msg_set_size(msg, LONG_H_SIZE + dsz); + destport = nametbl_translate(name->type, name->instance, &destnode); + msg_set_destnode(msg, destnode); + msg_set_destport(msg, destport); + msg_dbg(msg, "forw2name ==> "); + if (skb_cow(buf, LONG_H_SIZE)) + return -ENOMEM; + skb_push(buf, LONG_H_SIZE); + memcpy(buf->data, (unchar *)msg, LONG_H_SIZE); + msg_dbg(buf_msg(buf),"PREP:"); + if (likely(destport || destnode)) { + p_ptr->sent++; + if (destnode == tipc_own_addr) + return port_recv_msg(buf); + res = tipc_send_buf_fast(buf, destnode); + if (likely(res != -ELINKCONG)) + return res; + if (port_unreliable(p_ptr)) + return dsz; + return -ELINKCONG; + } + return tipc_reject_msg(buf, TIPC_ERR_NO_NAME); +} + +/** + * tipc_send_buf2name - send message buffer to port name + */ + +int tipc_send_buf2name(u32 ref, + struct tipc_name const *dest, + u32 domain, + struct sk_buff *buf, + unsigned int dsz) +{ + struct tipc_portid orig; + + orig.ref = ref; + orig.node = tipc_own_addr; + return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig, + TIPC_PORT_IMPORTANCE); +} + +/** + * 
tipc_forward2port - forward message sections to port identity + */ + +int tipc_forward2port(u32 ref, + struct tipc_portid const *dest, + unsigned int num_sect, + struct iovec const *msg_sect, + struct tipc_portid const *orig, + unsigned int importance) +{ + struct port *p_ptr; + struct tipc_msg *msg; + int res; + + p_ptr = port_deref(ref); + if (!p_ptr || p_ptr->publ.connected) + return -EINVAL; + + msg = &p_ptr->publ.phdr; + msg_set_type(msg, TIPC_DIRECT_MSG); + msg_set_orignode(msg, orig->node); + msg_set_origport(msg, orig->ref); + msg_set_destnode(msg, dest->node); + msg_set_destport(msg, dest->ref); + msg_set_hdr_sz(msg, DIR_MSG_H_SIZE); + if (importance <= TIPC_CRITICAL_IMPORTANCE) + msg_set_importance(msg, importance); + p_ptr->sent++; + if (dest->node == tipc_own_addr) + return port_recv_sections(p_ptr, num_sect, msg_sect); + res = link_send_sections_fast(p_ptr, msg_sect, num_sect, dest->node); + if (likely(res != -ELINKCONG)) + return res; + if (port_unreliable(p_ptr)) { + /* Just calculate msg length and return */ + return msg_calc_data_size(msg_sect, num_sect); + } + return -ELINKCONG; +} + +/** + * tipc_send2port - send message sections to port identity + */ + +int tipc_send2port(u32 ref, + struct tipc_portid const *dest, + unsigned int num_sect, + struct iovec const *msg_sect) +{ + struct tipc_portid orig; + + orig.ref = ref; + orig.node = tipc_own_addr; + return tipc_forward2port(ref, dest, num_sect, msg_sect, &orig, + TIPC_PORT_IMPORTANCE); +} + +/** + * tipc_forward_buf2port - forward message buffer to port identity + */ +int tipc_forward_buf2port(u32 ref, + struct tipc_portid const *dest, + struct sk_buff *buf, + unsigned int dsz, + struct tipc_portid const *orig, + unsigned int importance) +{ + struct port *p_ptr; + struct tipc_msg *msg; + int res; + + p_ptr = (struct port *)ref_deref(ref); + if (!p_ptr || p_ptr->publ.connected) + return -EINVAL; + + msg = &p_ptr->publ.phdr; + msg_set_type(msg, TIPC_DIRECT_MSG); + msg_set_orignode(msg, 
orig->node); + msg_set_origport(msg, orig->ref); + msg_set_destnode(msg, dest->node); + msg_set_destport(msg, dest->ref); + msg_set_hdr_sz(msg, DIR_MSG_H_SIZE); + if (importance <= TIPC_CRITICAL_IMPORTANCE) + msg_set_importance(msg, importance); + msg_set_size(msg, DIR_MSG_H_SIZE + dsz); + if (skb_cow(buf, DIR_MSG_H_SIZE)) + return -ENOMEM; + + skb_push(buf, DIR_MSG_H_SIZE); + memcpy(buf->data, (unchar *)msg, DIR_MSG_H_SIZE); + msg_dbg(msg, "buf2port: "); + p_ptr->sent++; + if (dest->node == tipc_own_addr) + return port_recv_msg(buf); + res = tipc_send_buf_fast(buf, dest->node); + if (likely(res != -ELINKCONG)) + return res; + if (port_unreliable(p_ptr)) + return dsz; + return -ELINKCONG; +} + +/** + * tipc_send_buf2port - send message buffer to port identity + */ + +int tipc_send_buf2port(u32 ref, + struct tipc_portid const *dest, + struct sk_buff *buf, + unsigned int dsz) +{ + struct tipc_portid orig; + + orig.ref = ref; + orig.node = tipc_own_addr; + return tipc_forward_buf2port(ref, dest, buf, dsz, &orig, + TIPC_PORT_IMPORTANCE); +} + diff --git a/net/tipc/port.h b/net/tipc/port.h new file mode 100644 index 000000000000..e829a99d3b7f --- /dev/null +++ b/net/tipc/port.h @@ -0,0 +1,209 @@ +/* + * net/tipc/port.h: Include file for TIPC port code + * + * Copyright (c) 1994-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_PORT_H +#define _TIPC_PORT_H + +#include <net/tipc/tipc_port.h> +#include "ref.h" +#include "net.h" +#include "msg.h" +#include "dbg.h" +#include "node_subscr.h" + +/** + * struct user_port - TIPC user port (used with native API) + * @user_ref: id of user who created user port + * @usr_handle: user-specified field + * @ref: object reference to associated TIPC port + * <various callback routines> + * @uport_list: adjacent user ports in list of ports held by user + */ + +struct user_port { + u32 user_ref; + void *usr_handle; + u32 ref; + tipc_msg_err_event err_cb; + tipc_named_msg_err_event named_err_cb; + tipc_conn_shutdown_event conn_err_cb; + tipc_msg_event msg_cb; + tipc_named_msg_event named_msg_cb; + tipc_conn_msg_event conn_msg_cb; + tipc_continue_event continue_event_cb; + struct list_head uport_list; +}; + +/** + * struct port - TIPC port structure + * @publ: TIPC port info available to privileged users + * @port_list: adjacent ports in TIPC's global list of ports + * @dispatcher: ptr to routine which handles received messages + * @wakeup: ptr to routine to call when port is no longer congested + * @user_port: ptr to user port associated with port (if any) + * @wait_list: adjacent ports in list of ports waiting on link congestion + * @congested_link: ptr to congested link port is waiting on + * @waiting_pkts: + * @sent: + * @acked: + * @publications: list of publications for port + * @pub_count: total # of publications port has made during its lifetime + * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @probing_state: + * @probing_interval: + * @last_in_seqno: + * @timer_ref: + * @subscription: "node down" subscription used to terminate failed connections + */ + +struct port { + struct tipc_port publ; + struct list_head port_list; + u32 (*dispatcher)(struct tipc_port *, struct sk_buff *); + void (*wakeup)(struct tipc_port *); + struct user_port *user_port; + struct list_head wait_list; + struct link 
*congested_link; + u32 waiting_pkts; + u32 sent; + u32 acked; + struct list_head publications; + u32 pub_count; + u32 max_pkt; + u32 probing_state; + u32 probing_interval; + u32 last_in_seqno; + struct timer_list timer; + struct node_subscr subscription; +}; + +extern spinlock_t port_list_lock; +struct port_list; + +int port_recv_sections(struct port *p_ptr, u32 num_sect, + struct iovec const *msg_sect); +int port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr, + struct iovec const *msg_sect, u32 num_sect, + int err); +struct sk_buff *port_get_ports(void); +struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space); +void port_recv_proto_msg(struct sk_buff *buf); +void port_recv_mcast(struct sk_buff *buf, struct port_list *dp); +void port_reinit(void); + +/** + * port_lock - lock port instance referred to and return its pointer + */ + +static inline struct port *port_lock(u32 ref) +{ + return (struct port *)ref_lock(ref); +} + +/** + * port_unlock - unlock a port instance + * + * Can use pointer instead of ref_unlock() since port is already locked. 
+ */
+
+static inline void port_unlock(struct port *p_ptr)
+{
+	spin_unlock_bh(p_ptr->publ.lock);
+}
+
+/*
+ * port_deref - translate a reference into a port pointer via ref_deref(),
+ * i.e. without taking the port's lock
+ */
+static inline struct port* port_deref(u32 ref)
+{
+	return (struct port *)ref_deref(ref);
+}
+
+/* peer_port - destination port stored in the port's own message header */
+static inline u32 peer_port(struct port *p_ptr)
+{
+	return msg_destport(&p_ptr->publ.phdr);
+}
+
+/* peer_node - destination node stored in the port's own message header */
+static inline u32 peer_node(struct port *p_ptr)
+{
+	return msg_destnode(&p_ptr->publ.phdr);
+}
+
+/*
+ * port_congested - non-zero once the number of sent-but-unacknowledged
+ * messages (sent - acked) reaches twice TIPC_FLOW_CONTROL_WIN
+ */
+static inline int port_congested(struct port *p_ptr)
+{
+	return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2));
+}
+
+/**
+ * port_recv_msg - receive message from lower layer and deliver to port user
+ * @buf: buffer holding the message; always handed on (routed, dispatched or
+ *       rejected) before this routine returns
+ *
+ * Returns the message's data size when the message is forwarded or when the
+ * port's dispatcher returns 0; otherwise returns the result of
+ * tipc_reject_msg().
+ */
+
+static inline int port_recv_msg(struct sk_buff *buf)
+{
+	struct port *p_ptr;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 destport = msg_destport(msg);
+	u32 dsz = msg_data_sz(msg);
+	u32 err;
+
+	/* forward unresolved named message */
+	if (unlikely(!destport)) {
+		net_route_msg(buf);
+		return dsz;
+	}
+
+	/* validate destination & pass to port, otherwise reject message */
+	p_ptr = port_lock(destport);
+	if (likely(p_ptr)) {
+		if (likely(p_ptr->publ.connected)) {
+			/*
+			 * a connected port only accepts messages that come
+			 * from its recorded peer and are marked as connected
+			 */
+			if ((unlikely(msg_origport(msg) != peer_port(p_ptr))) ||
+			    (unlikely(msg_orignode(msg) != peer_node(p_ptr))) ||
+			    (unlikely(!msg_connected(msg)))) {
+				err = TIPC_ERR_NO_PORT;
+				port_unlock(p_ptr);
+				goto reject;
+			}
+		}
+		/* dispatcher runs with the port lock held */
+		err = p_ptr->dispatcher(&p_ptr->publ, buf);
+		port_unlock(p_ptr);
+		if (likely(!err))
+			return dsz;
+	} else {
+		/* port_lock() found no port for this reference */
+		err = TIPC_ERR_NO_PORT;
+	}
+reject:
+	dbg("port->rejecting, err = %x..\n",err);
+	return tipc_reject_msg(buf, err);
+}
+
+#endif
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
new file mode 100644
index 000000000000..944093fe246f
--- /dev/null
+++ b/net/tipc/ref.c
@@ -0,0 +1,189 @@
+/*
+ * net/tipc/ref.c: TIPC object registry code
+ *
+ * Copyright (c) 1991-2006, Ericsson AB
+ * Copyright (c) 2004-2005, Wind River Systems
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "ref.h" +#include "port.h" +#include "subscr.h" +#include "name_distr.h" +#include "name_table.h" +#include "config.h" +#include "discover.h" +#include "bearer.h" +#include "node.h" +#include "bcast.h" + +/* + * Object reference table consists of 2**N entries. 
+ *
+ * A used entry has object ptr != 0, reference == XXXX|own index
+ *                                   (XXXX changes each time entry is acquired)
+ * A free entry has object ptr == 0, reference == YYYY|next free index
+ *                                   (YYYY is one more than last used XXXX)
+ *
+ * Free list is initially chained from entry (2**N)-1 to entry 1.
+ * Entry 0 is not used to allow index 0 to indicate the end of the free list.
+ *
+ * Note: Any accidental reference of the form XXXX|0--0 won't match entry 0
+ * because entry 0's reference field has the form XXXX|1--1.
+ */
+
+struct ref_table ref_table = { 0 };
+
+rwlock_t reftbl_lock = RW_LOCK_UNLOCKED;
+
+/**
+ * ref_table_init - create reference table for objects
+ * @requested_size: minimum number of entries wanted; the table size is the
+ *                  smallest power of two that is >= this value and >= 16
+ * @start: value whose bits above the index mask seed the upper ("XXXX")
+ *         part of every entry's reference field
+ *
+ * Returns TIPC_OK on success, -EINVAL if the requested size cannot be
+ * represented (rounding up would overflow), -ENOMEM if allocation fails.
+ */
+
+int ref_table_init(u32 requested_size, u32 start)
+{
+	struct reference *table;
+	u32 sz = 1 << 4;
+	u32 index_mask;
+	int i;
+
+	while (sz < requested_size) {
+		/* doubling 2**31 would wrap sz to 0 and the loop would never end */
+		if (sz & 0x80000000u)
+			return -EINVAL;
+		sz <<= 1;
+	}
+	/* refuse sizes whose byte count would overflow vmalloc()'s argument */
+	if (sz > ~0UL / sizeof(struct reference))
+		return -EINVAL;
+	table = (struct reference *)vmalloc(sz * sizeof(struct reference));
+	if (table == NULL)
+		return -ENOMEM;
+
+	write_lock_bh(&reftbl_lock);
+	index_mask = sz - 1;
+	/*
+	 * Chain every entry to its predecessor (entry i -> i - 1), so the free
+	 * list runs from sz - 1 down to 1.  Entry 0 is initialised as well; its
+	 * "i - 1" borrows from the upper bits and leaves the index bits all ones.
+	 */
+	for (i = sz - 1; i >= 0; i--) {
+		table[i].object = 0;
+		table[i].lock = SPIN_LOCK_UNLOCKED;
+		table[i].data.next_plus_upper = (start & ~index_mask) + i - 1;
+	}
+	ref_table.entries = table;
+	ref_table.index_mask = index_mask;
+	ref_table.first_free = sz - 1;
+	ref_table.last_free = 1;
+	write_unlock_bh(&reftbl_lock);
+	return TIPC_OK;
+}
+
+/**
+ * ref_table_stop - destroy reference table for objects
+ *
+ * Frees the table (if one exists) and clears ref_table.entries.
+ * NOTE(review): this is done without taking reftbl_lock - presumably all
+ * users of the table have stopped by the time this is called; confirm.
+ */
+
+void ref_table_stop(void)
+{
+	if (!ref_table.entries)
+		return;
+
+	vfree(ref_table.entries);
+	ref_table.entries = 0;
+}
+
+/**
+ * ref_acquire - create reference to an object
+ *
+ * Return a unique reference value which can be translated back to the pointer
+ * 'object' at a later time. Also, pass back a pointer to the lock protecting
+ * the object, but without locking it.
+ *
+ * Returns 0 if the table has no free entry.
+ */
+
+u32 ref_acquire(void *object, spinlock_t **lock)
+{
+	struct reference *entry;
+	u32 index;
+	u32 index_mask;
+	u32 next_plus_upper;
+	u32 reference = 0;
+
+	assert(ref_table.entries && object);
+
+	write_lock_bh(&reftbl_lock);
+	/* first_free == 0 means the free list is empty; 0 is then returned */
+	if (ref_table.first_free) {
+		index = ref_table.first_free;
+		entry = &(ref_table.entries[index]);
+		index_mask = ref_table.index_mask;
+		/* take lock in case a previous user of entry still holds it */
+		spin_lock_bh(&entry->lock);
+		next_plus_upper = entry->data.next_plus_upper;
+		/* low bits: index of the next free entry (new head of free list) */
+		ref_table.first_free = next_plus_upper & index_mask;
+		/* high bits combined with this entry's own index form the reference */
+		reference = (next_plus_upper & ~index_mask) + index;
+		entry->data.reference = reference;
+		entry->object = object;
+		if (lock != 0)
+			*lock = &entry->lock;
+		spin_unlock_bh(&entry->lock);
+	}
+	write_unlock_bh(&reftbl_lock);
+	return reference;
+}
+
+/**
+ * ref_discard - invalidate references to an object
+ * @ref: reference previously returned by ref_acquire(); must be non-zero and
+ *       must still match the entry it indexes (both are asserted)
+ *
+ * Disallow future references to an object and free up the entry for re-use.
+ * Note: The entry's spin_lock may still be busy after discard
+ */
+
+void ref_discard(u32 ref)
+{
+	struct reference *entry;
+	u32 index;
+	u32 index_mask;
+
+	assert(ref_table.entries);
+	assert(ref != 0);
+
+	write_lock_bh(&reftbl_lock);
+	index_mask = ref_table.index_mask;
+	index = ref & index_mask;
+	entry = &(ref_table.entries[index]);
+	assert(entry->object != 0);
+	assert(entry->data.reference == ref);
+
+	/* mark entry as unused */
+	entry->object = 0;
+	/* append the entry to the tail of the free list */
+	if (ref_table.first_free == 0)
+		ref_table.first_free = index;
+	else
+		/* next_plus_upper is always XXXX|0--0 for last free entry */
+		ref_table.entries[ref_table.last_free].data.next_plus_upper
+			|= index;
+	ref_table.last_free = index;
+
+	/* increment upper bits of entry to invalidate subsequent references */
+	entry->data.next_plus_upper = (ref & ~index_mask) + (index_mask + 1);
+	write_unlock_bh(&reftbl_lock);
+}
+
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
new file mode 100644
index 000000000000..429cde57228a
--- /dev/null
+++ b/net/tipc/ref.h
@@
-0,0 +1,131 @@ +/* + * net/tipc/ref.h: Include file for TIPC object registry code + * + * Copyright (c) 1991-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_REF_H +#define _TIPC_REF_H + +/** + * struct reference - TIPC object reference entry + * @object: pointer to object associated with reference entry + * @lock: spinlock controlling access to object + * @data: reference value associated with object (or link to next unused entry) + */ + +struct reference { + void *object; + spinlock_t lock; + union { + u32 next_plus_upper; + u32 reference; + } data; +}; + +/** + * struct ref_table - table of TIPC object reference entries + * @entries: pointer to array of reference entries + * @index_mask: bitmask for array index portion of reference values + * @first_free: array index of first unused object reference entry + * @last_free: array index of last unused object reference entry + */ + +struct ref_table { + struct reference *entries; + u32 index_mask; + u32 first_free; + u32 last_free; +}; + +extern struct ref_table ref_table; + +int ref_table_init(u32 requested_size, u32 start); +void ref_table_stop(void); + +u32 ref_acquire(void *object, spinlock_t **lock); +void ref_discard(u32 ref); + + +/** + * ref_lock - lock referenced object and return pointer to it + */ + +static inline void *ref_lock(u32 ref) +{ + if (likely(ref_table.entries)) { + struct reference *r = + &ref_table.entries[ref & ref_table.index_mask]; + + spin_lock_bh(&r->lock); + if (likely(r->data.reference == ref)) + return r->object; + spin_unlock_bh(&r->lock); + } + return 0; +} + +/** + * ref_unlock - unlock referenced object + */ + +static inline void ref_unlock(u32 ref) +{ + if (likely(ref_table.entries)) { + struct reference *r = + &ref_table.entries[ref & ref_table.index_mask]; + + if (likely(r->data.reference == ref)) + spin_unlock_bh(&r->lock); + else + err("ref_unlock() invoked using obsolete reference\n"); + } +} + +/** + * ref_deref - return pointer referenced object (without locking it) + */ + +static inline void *ref_deref(u32 ref) +{ + if (likely(ref_table.entries)) { + struct reference *r = + &ref_table.entries[ref & 
ref_table.index_mask]; + + if (likely(r->data.reference == ref)) + return r->object; + } + return 0; +} + +#endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c new file mode 100644 index 000000000000..d21f8c0cd25a --- /dev/null +++ b/net/tipc/socket.c @@ -0,0 +1,1726 @@ +/* + * net/tipc/socket.c: TIPC socket API + * + * Copyright (c) 2001-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/version.h> +#include <linux/fcntl.h> +#include <linux/version.h> +#include <asm/semaphore.h> +#include <asm/string.h> +#include <asm/atomic.h> +#include <net/sock.h> + +#include <linux/tipc.h> +#include <linux/tipc_config.h> +#include <net/tipc/tipc_msg.h> +#include <net/tipc/tipc_port.h> + +#include "core.h" + +#define SS_LISTENING -1 /* socket is listening */ +#define SS_READY -2 /* socket is connectionless */ + +#define OVERLOAD_LIMIT_BASE 5000 + +struct tipc_sock { + struct sock sk; + struct tipc_port *p; + struct semaphore sem; +}; + +#define tipc_sk(sk) ((struct tipc_sock*)sk) + +static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf); +static void wakeupdispatch(struct tipc_port *tport); + +static struct proto_ops packet_ops; +static struct proto_ops stream_ops; +static struct proto_ops msg_ops; + +static struct proto tipc_proto; + +static int sockets_enabled = 0; + +static atomic_t tipc_queue_size = ATOMIC_INIT(0); + + +/* + * sock_lock(): Lock a port/socket pair. lock_sock() can + * not be used here, since the same lock must protect ports + * with non-socket interfaces. + * See net.c for description of locking policy. 
+ */
+static inline void sock_lock(struct tipc_sock* tsock)
+{
+	spin_lock_bh(tsock->p->lock);
+}
+
+/*
+ * sock_unlock(): Unlock a port/socket pair
+ */
+static inline void sock_unlock(struct tipc_sock* tsock)
+{
+	spin_unlock_bh(tsock->p->lock);
+}
+
+/**
+ * pollmask - determine the current set of poll() events for a socket
+ * @sock: socket structure
+ *
+ * TIPC sets the returned events as follows:
+ * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty
+ *    or if the socket's state is SS_UNCONNECTED or SS_DISCONNECTING, i.e. a
+ *    connection-oriented socket does not have an active connection
+ *    (i.e. a read operation will not block).
+ * b) POLLOUT is set except when a socket's connection has been terminated
+ *    (i.e. a write operation will not block).
+ * c) POLLHUP is set when a socket's connection has been terminated.
+ *
+ * IMPORTANT: The fact that a read or write operation will not block does NOT
+ * imply that the operation will succeed!
+ *
+ * Returns pollmask value
+ */
+
+static inline u32 pollmask(struct socket *sock)
+{
+	u32 mask;
+
+	if ((skb_queue_len(&sock->sk->sk_receive_queue) != 0) ||
+	    (sock->state == SS_UNCONNECTED) ||
+	    (sock->state == SS_DISCONNECTING))
+		mask = (POLLRDNORM | POLLIN);
+	else
+		mask = 0;
+
+	/* "terminated" is represented by the SS_DISCONNECTING state */
+	if (sock->state == SS_DISCONNECTING)
+		mask |= POLLHUP;
+	else
+		mask |= POLLOUT;
+
+	return mask;
+}
+
+
+/**
+ * advance_queue - discard first buffer in queue
+ * @tsock: TIPC socket
+ *
+ * The buffer is dequeued under the port/socket lock; the global
+ * tipc_queue_size counter is then decremented.
+ * NOTE(review): the counter is decremented even if the queue was empty -
+ * presumably callers only invoke this when a buffer is queued; confirm.
+ */
+
+static inline void advance_queue(struct tipc_sock *tsock)
+{
+	sock_lock(tsock);
+	buf_discard(skb_dequeue(&tsock->sk.sk_receive_queue));
+	sock_unlock(tsock);
+	atomic_dec(&tipc_queue_size);
+}
+
+/**
+ * tipc_create - create a TIPC socket
+ * @sock: pre-allocated socket structure
+ * @protocol: protocol indicator (must be 0)
+ *
+ * This routine creates and attaches a 'struct sock' to the 'struct socket',
+ * then creates and attaches a TIPC port to the 'struct sock' part.
+ *
+ * Socket types SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM and SOCK_RDM are
+ * accepted; any other type fails with -EPROTOTYPE, and a non-zero protocol
+ * fails with -EPROTONOSUPPORT. -ENOMEM is returned if either the port or
+ * the 'struct sock' cannot be created.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_create(struct socket *sock, int protocol)
+{
+	struct tipc_sock *tsock;
+	struct tipc_port *port;
+	struct sock *sk;
+	u32 ref;
+
+	if ((sock->type != SOCK_STREAM) &&
+	    (sock->type != SOCK_SEQPACKET) &&
+	    (sock->type != SOCK_DGRAM) &&
+	    (sock->type != SOCK_RDM))
+		return -EPROTOTYPE;
+
+	if (unlikely(protocol != 0))
+		return -EPROTONOSUPPORT;
+
+	/* the port is created first; its user handle is filled in further down */
+	ref = tipc_createport_raw(0, &dispatch, &wakeupdispatch, TIPC_LOW_IMPORTANCE);
+	if (unlikely(!ref))
+		return -ENOMEM;
+
+	sock->state = SS_UNCONNECTED;
+
+	switch (sock->type) {
+	case SOCK_STREAM:
+		sock->ops = &stream_ops;
+		break;
+	case SOCK_SEQPACKET:
+		sock->ops = &packet_ops;
+		break;
+	case SOCK_DGRAM:
+		/* DGRAM is RDM plus the "unreliable" port setting */
+		tipc_set_portunreliable(ref, 1);
+		/* fall through */
+	case SOCK_RDM:
+		tipc_set_portunreturnable(ref, 1);
+		sock->ops = &msg_ops;
+		sock->state = SS_READY;
+		break;
+	}
+
+	sk = sk_alloc(AF_TIPC, GFP_KERNEL, &tipc_proto, 1);
+	if (!sk) {
+		/* undo the port creation so the reference is not leaked */
+		tipc_deleteport(ref);
+		return -ENOMEM;
+	}
+
+	sock_init_data(sock, sk);
+	init_waitqueue_head(sk->sk_sleep);
+	sk->sk_rcvtimeo = 8 * HZ;   /* default connect timeout = 8s */
+
+	/* link socket and port to each other */
+	tsock = tipc_sk(sk);
+	port = tipc_get_port(ref);
+
+	tsock->p = port;
+	port->usr_handle = tsock;
+
+	init_MUTEX(&tsock->sem);
+
+	/* NOTE(review): a pointer is printed with %x here - %p looks intended */
+	dbg("sock_create: %x\n",tsock);
+
+	atomic_inc(&tipc_user_count);
+
+	return 0;
+}
+
+/**
+ * release - destroy a TIPC socket
+ * @sock: socket to destroy
+ *
+ * This routine cleans up any messages that are still queued on the socket.
+ * For DGRAM and RDM socket types, all queued messages are rejected.
+ * For SEQPACKET and STREAM socket types, the first message is rejected
+ * and any others are discarded. (If the first message on a STREAM socket
+ * is partially-read, it is discarded and the next one is rejected instead.)
+ *
+ * NOTE: Rejected messages are not necessarily returned to the sender!
They + * are returned or discarded according to the "destination droppable" setting + * specified for the message by the sender. + * + * The socket semaphore is taken uninterruptibly: a release cannot be retried, + * and carrying on without the semaphore would unbalance the final up(). + * + * Returns 0 on success, errno otherwise + */ + +static int release(struct socket *sock) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sock *sk = sock->sk; + int res = TIPC_OK; + struct sk_buff *buf; + + dbg("sock_delete: %x\n",tsock); + if (!tsock) + return 0; + + /* Must not be interruptible: teardown below always ends with up() */ + + down(&tsock->sem); + if (!sock->sk) { + up(&tsock->sem); + return 0; + } + + /* Reject unreceived messages, unless no longer connected */ + + while (sock->state != SS_DISCONNECTING) { + sock_lock(tsock); + buf = skb_dequeue(&sk->sk_receive_queue); + /* Queue drained: detach socket from port so that dispatch() + * rejects any message arriving from now on + */ + if (!buf) + tsock->p->usr_handle = 0; + sock_unlock(tsock); + if (!buf) + break; + /* A moved read handle marks a partially-read message: discard it */ + if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf))) + buf_discard(buf); + else + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + atomic_dec(&tipc_queue_size); + } + + /* Delete TIPC port */ + + res = tipc_deleteport(tsock->p->ref); + sock->sk = NULL; + + /* Discard any remaining messages */ + + while ((buf = skb_dequeue(&sk->sk_receive_queue))) { + buf_discard(buf); + atomic_dec(&tipc_queue_size); + } + + up(&tsock->sem); + + sock_put(sk); + + atomic_dec(&tipc_user_count); + return res; +} + +/** + * bind - associate or disassociate TIPC name(s) with a socket + * @sock: socket structure + * @uaddr: socket address describing name(s) and desired operation + * @uaddr_len: size of socket address data structure + * + * Name and name sequence binding is indicated using a positive scope value; + * a negative scope value unbinds the specified name. Specifying no name + * (i.e. a socket address length of 0) unbinds all names from the socket. 
+ * + * Returns 0 on success, errno otherwise + */ + +static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; + int res; + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + /* No address supplied: withdraw every name bound to the port */ + + if (unlikely(!uaddr_len)) { + res = tipc_withdraw(tsock->p->ref, 0, 0); + goto exit; + } + + if (uaddr_len < sizeof(struct sockaddr_tipc)) { + res = -EINVAL; + goto exit; + } + + if (addr->family != AF_TIPC) { + res = -EAFNOSUPPORT; + goto exit; + } + + /* A single name is handled as a name sequence with upper == lower */ + + if (addr->addrtype == TIPC_ADDR_NAME) + addr->addr.nameseq.upper = addr->addr.nameseq.lower; + else if (addr->addrtype != TIPC_ADDR_NAMESEQ) { + res = -EAFNOSUPPORT; + goto exit; + } + + if (addr->scope > 0) + res = tipc_publish(tsock->p->ref, addr->scope, + &addr->addr.nameseq); + else + res = tipc_withdraw(tsock->p->ref, -addr->scope, + &addr->addr.nameseq); +exit: + up(&tsock->sem); + return res; +} + +/** + * get_name - get port ID of socket or peer socket + * @sock: socket structure + * @uaddr: area for returned socket address + * @uaddr_len: area for returned length of socket address + * @peer: 0 to obtain socket name, 1 to obtain peer socket name + * + * The returned address always has type TIPC_ADDR_ID and scope 0. + * + * Returns 0 on success, errno otherwise + */ + +static int get_name(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; + int res; /* signed: carries the errno-style result returned below */ + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + *uaddr_len = sizeof(*addr); + addr->addrtype = TIPC_ADDR_ID; + addr->family = AF_TIPC; + addr->scope = 0; + if (peer) + res = tipc_peer(tsock->p->ref, &addr->addr.id); + else + res = tipc_ownidentity(tsock->p->ref, &addr->addr.id); + addr->addr.name.domain = 0; + + up(&tsock->sem); + return res; +} + +/** + * poll - read and possibly block on pollmask + * @file: file structure associated with the socket + * @sock: socket 
for which to calculate the poll bits + * @wait: poll table handed on to poll_wait() + * + * Returns the pollmask + */ + +static unsigned int poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + poll_wait(file, sock->sk->sk_sleep, wait); + /* NEED LOCK HERE? */ + return pollmask(sock); +} + +/** + * dest_name_check - verify user is permitted to send to specified port name + * @dest: destination address + * @m: descriptor for message to be sent + * + * Prevents restricted configuration commands from being issued by + * unauthorized users. + * + * Name types of TIPC_RESERVED_TYPES and above, and TIPC_TOP_SRV, are open to + * every sender. Of the remaining reserved types only TIPC_CFG_SRV is + * accepted, and a configuration command whose type has either of the bits + * in 0xC000 set additionally requires CAP_NET_ADMIN. + * + * Returns 0 if permission is granted, otherwise errno + */ + +static inline int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) +{ + struct tipc_cfg_msg_hdr hdr; + + if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES)) + return 0; + if (likely(dest->addr.name.name.type == TIPC_TOP_SRV)) + return 0; + + if (likely(dest->addr.name.name.type != TIPC_CFG_SRV)) + return -EACCES; + + /* The command header is read from the first iovec entry, so that + * entry must exist and be large enough (connect() sends no data) + */ + + if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(hdr))) + return -EMSGSIZE; + if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) + return -EFAULT; + + /* Logical AND is required here: a bitwise AND of the masked type + * (bits 14-15 only) with the 0/1 result of !capable() is always zero + */ + + if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) + return -EACCES; + + return 0; +} + +/** + * send_msg - send message in connectionless manner + * @iocb: (unused) + * @sock: socket structure + * @m: message to send + * @total_len: length of message data + * + * Message must have a destination specified explicitly. + * Used for SOCK_RDM and SOCK_DGRAM messages, + * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections. + * (Note: 'SYN+' is prohibited on SOCK_STREAM.) 
+ * + * Returns the number of bytes sent on success, or errno otherwise + */ + +static int send_msg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; + struct sk_buff *buf; + int needs_conn; + int res = -EINVAL; + + if (unlikely(!dest)) + return -EDESTADDRREQ; + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + + needs_conn = (sock->state != SS_READY); + if (unlikely(needs_conn)) { + if (sock->state == SS_LISTENING) + return -EPIPE; + if (sock->state != SS_UNCONNECTED) + return -EISCONN; + if ((tsock->p->published) || + ((sock->type == SOCK_STREAM) && (total_len != 0))) + return -EOPNOTSUPP; + } + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + if (needs_conn) { + + /* Abort any pending connection attempts (very unlikely) */ + + while ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) { + tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + atomic_dec(&tipc_queue_size); + } + + sock->state = SS_CONNECTING; + } + + do { + if (dest->addrtype == TIPC_ADDR_NAME) { + if ((res = dest_name_check(dest, m))) + goto exit; + res = tipc_send2name(tsock->p->ref, + &dest->addr.name.name, + dest->addr.name.domain, + m->msg_iovlen, + m->msg_iov); + } + else if (dest->addrtype == TIPC_ADDR_ID) { + res = tipc_send2port(tsock->p->ref, + &dest->addr.id, + m->msg_iovlen, + m->msg_iov); + } + else if (dest->addrtype == TIPC_ADDR_MCAST) { + if (needs_conn) { + res = -EOPNOTSUPP; + goto exit; + } + if ((res = dest_name_check(dest, m))) + goto exit; + res = tipc_multicast(tsock->p->ref, + &dest->addr.nameseq, + 0, + m->msg_iovlen, + m->msg_iov); + } + if (likely(res != -ELINKCONG)) { +exit: + up(&tsock->sem); + return res; + } + if (m->msg_flags & MSG_DONTWAIT) { + res = -EWOULDBLOCK; + goto exit; + } + if (wait_event_interruptible(*sock->sk->sk_sleep, + !tsock->p->congested)) { + res = -ERESTARTSYS; + goto exit; + } + } while 
(1); +} + +/** + * send_packet - send a connection-oriented message + * @iocb: (unused) + * @sock: socket structure + * @m: message to send + * @total_len: (unused) + * + * Used for SOCK_SEQPACKET messages and SOCK_STREAM data. + * + * Returns the number of bytes sent on success, or errno otherwise + */ + +static int send_packet(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name; + int res; + + /* Handle implied connection establishment */ + + if (unlikely(dest)) + return send_msg(iocb, sock, m, total_len); + + if (down_interruptible(&tsock->sem)) { + return -ERESTARTSYS; + } + + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_DISCONNECTING) + res = -EPIPE; + else + res = -ENOTCONN; + goto exit; + } + + do { + res = tipc_send(tsock->p->ref, m->msg_iovlen, m->msg_iov); + if (likely(res != -ELINKCONG)) { +exit: + up(&tsock->sem); + return res; + } + if (m->msg_flags & MSG_DONTWAIT) { + res = -EWOULDBLOCK; + goto exit; + } + if (wait_event_interruptible(*sock->sk->sk_sleep, + !tsock->p->congested)) { + res = -ERESTARTSYS; + goto exit; + } + } while (1); +} + +/** + * send_stream - send stream-oriented data + * @iocb: (unused) + * @sock: socket structure + * @m: data to send + * @total_len: total length of data to be sent + * + * Used for SOCK_STREAM data. 
+ * + * Returns the number of bytes sent on success, or errno otherwise + */ + + +static int send_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t total_len) +{ + struct msghdr my_msg; + struct iovec my_iov; + struct iovec *curr_iov; + int curr_iovlen; + char __user *curr_start; + int curr_left; + int bytes_to_send; + int res; + + if (likely(total_len <= TIPC_MAX_USER_MSG_SIZE)) + return send_packet(iocb, sock, m, total_len); + + /* Can only send large data streams if already connected */ + + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_DISCONNECTING) + return -EPIPE; + else + return -ENOTCONN; + } + + /* + * Send each iovec entry using one or more messages + * + * Note: This algorithm is good for the most likely case + * (i.e. one large iovec entry), but could be improved to pass sets + * of small iovec entries into send_packet(). + */ + + my_msg = *m; + curr_iov = my_msg.msg_iov; + curr_iovlen = my_msg.msg_iovlen; + my_msg.msg_iov = &my_iov; + my_msg.msg_iovlen = 1; + + while (curr_iovlen--) { + curr_start = curr_iov->iov_base; + curr_left = curr_iov->iov_len; + + while (curr_left) { + bytes_to_send = (curr_left < TIPC_MAX_USER_MSG_SIZE) + ? 
curr_left : TIPC_MAX_USER_MSG_SIZE; + my_iov.iov_base = curr_start; + my_iov.iov_len = bytes_to_send; + if ((res = send_packet(iocb, sock, &my_msg, 0)) < 0) + return res; + curr_left -= bytes_to_send; + curr_start += bytes_to_send; + } + + curr_iov++; + } + + return total_len; +} + +/** + * auto_connect - complete connection setup to a remote port + * @sock: socket structure + * @tsock: TIPC-specific socket structure + * @msg: peer's response message + * + * Returns 0 on success, errno otherwise + */ + +static int auto_connect(struct socket *sock, struct tipc_sock *tsock, + struct tipc_msg *msg) +{ + struct tipc_portid peer; + + if (msg_errcode(msg)) { + sock->state = SS_DISCONNECTING; + return -ECONNREFUSED; + } + + peer.ref = msg_origport(msg); + peer.node = msg_orignode(msg); + tipc_connect2port(tsock->p->ref, &peer); + tipc_set_portimportance(tsock->p->ref, msg_importance(msg)); + sock->state = SS_CONNECTED; + return 0; +} + +/** + * set_orig_addr - capture sender's address for received message + * @m: descriptor for message info + * @msg: received message header + * + * Note: Address is not captured if not requested by receiver. + */ + +static inline void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) +{ + struct sockaddr_tipc *addr = (struct sockaddr_tipc *)m->msg_name; + + if (addr) { + addr->family = AF_TIPC; + addr->addrtype = TIPC_ADDR_ID; + addr->addr.id.ref = msg_origport(msg); + addr->addr.id.node = msg_orignode(msg); + addr->addr.name.domain = 0; /* could leave uninitialized */ + addr->scope = 0; /* could leave uninitialized */ + m->msg_namelen = sizeof(struct sockaddr_tipc); + } +} + +/** + * anc_data_recv - optionally capture ancillary data for received message + * @m: descriptor for message info + * @msg: received message header + * @tport: TIPC port associated with message + * + * Note: Ancillary data is not captured if not requested by receiver. 
+ * + * Returns 0 if successful, otherwise errno + */ + +static inline int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, + struct tipc_port *tport) +{ + u32 anc_data[3]; + u32 err; + u32 dest_type; + int res; + + if (likely(m->msg_controllen == 0)) + return 0; + + /* Optionally capture errored message object(s) */ + + err = msg ? msg_errcode(msg) : 0; + if (unlikely(err)) { + anc_data[0] = err; + anc_data[1] = msg_data_sz(msg); + if ((res = put_cmsg(m, SOL_SOCKET, TIPC_ERRINFO, 8, anc_data))) + return res; + if (anc_data[1] && + (res = put_cmsg(m, SOL_SOCKET, TIPC_RETDATA, anc_data[1], + msg_data(msg)))) + return res; + } + + /* Optionally capture message destination object */ + + dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; + switch (dest_type) { + case TIPC_NAMED_MSG: + anc_data[0] = msg_nametype(msg); + anc_data[1] = msg_namelower(msg); + anc_data[2] = msg_namelower(msg); + break; + case TIPC_MCAST_MSG: + anc_data[0] = msg_nametype(msg); + anc_data[1] = msg_namelower(msg); + anc_data[2] = msg_nameupper(msg); + break; + case TIPC_CONN_MSG: + anc_data[0] = tport->conn_type; + anc_data[1] = tport->conn_instance; + anc_data[2] = tport->conn_instance; + break; + default: + anc_data[0] = 0; + } + if (anc_data[0] && + (res = put_cmsg(m, SOL_SOCKET, TIPC_DESTNAME, 12, anc_data))) + return res; + + return 0; +} + +/** + * recv_msg - receive packet-oriented message + * @iocb: (unused) + * @m: descriptor for message info + * @buf_len: total size of user buffer area + * @flags: receive flags + * + * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. + * If the complete message doesn't fit in user area, truncate it. 
+ * + * Returns size of returned message data, errno otherwise + */ + +static int recv_msg(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sk_buff *buf; + struct tipc_msg *msg; + unsigned int q_len; + unsigned int sz; + u32 err; + int res; + + /* Currently doesn't support receiving into multiple iovec entries */ + + if (m->msg_iovlen != 1) + return -EOPNOTSUPP; + + /* Catch invalid receive attempts */ + + if (unlikely(!buf_len)) + return -EINVAL; + + if (sock->type == SOCK_SEQPACKET) { + if (unlikely(sock->state == SS_UNCONNECTED)) + return -ENOTCONN; + if (unlikely((sock->state == SS_DISCONNECTING) && + (skb_queue_len(&sock->sk->sk_receive_queue) == 0))) + return -ENOTCONN; + } + + /* Look for a message in receive queue; wait if necessary */ + + if (unlikely(down_interruptible(&tsock->sem))) + return -ERESTARTSYS; + +restart: + if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) && + (flags & MSG_DONTWAIT))) { + res = -EWOULDBLOCK; + goto exit; + } + + if ((res = wait_event_interruptible( + *sock->sk->sk_sleep, + ((q_len = skb_queue_len(&sock->sk->sk_receive_queue)) || + (sock->state == SS_DISCONNECTING))) )) { + goto exit; + } + + /* Catch attempt to receive on an already terminated connection */ + /* [THIS CHECK MAY OVERLAP WITH AN EARLIER CHECK] */ + + if (!q_len) { + res = -ENOTCONN; + goto exit; + } + + /* Get access to first message in receive queue */ + + buf = skb_peek(&sock->sk->sk_receive_queue); + msg = buf_msg(buf); + sz = msg_data_sz(msg); + err = msg_errcode(msg); + + /* Complete connection setup for an implied connect */ + + if (unlikely(sock->state == SS_CONNECTING)) { + if ((res = auto_connect(sock, tsock, msg))) + goto exit; + } + + /* Discard an empty non-errored message & try again */ + + if ((!sz) && (!err)) { + advance_queue(tsock); + goto restart; + } + + /* Capture sender's address (optional) */ + + set_orig_addr(m, msg); + + /* 
Capture ancillary data (optional) */ + + if ((res = anc_data_recv(m, msg, tsock->p))) + goto exit; + + /* Capture message data (if valid) & compute return value (always) */ + + if (!err) { + if (unlikely(buf_len < sz)) { + sz = buf_len; + m->msg_flags |= MSG_TRUNC; + } + if (unlikely(copy_to_user(m->msg_iov->iov_base, msg_data(msg), + sz))) { + res = -EFAULT; + goto exit; + } + res = sz; + } else { + if ((sock->state == SS_READY) || + ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)) + res = 0; + else + res = -ECONNRESET; + } + + /* Consume received message (optional) */ + + if (likely(!(flags & MSG_PEEK))) { + if (unlikely(++tsock->p->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) + tipc_acknowledge(tsock->p->ref, tsock->p->conn_unacked); + advance_queue(tsock); + } +exit: + up(&tsock->sem); + return res; +} + +/** + * recv_stream - receive stream-oriented data + * @iocb: (unused) + * @m: descriptor for message info + * @buf_len: total size of user buffer area + * @flags: receive flags + * + * Used for SOCK_STREAM messages only. If not enough data is available + * will optionally wait for more; never truncates data. 
+ * + * Returns size of returned message data, errno otherwise + */ + +static int recv_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t buf_len, int flags) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sk_buff *buf; + struct tipc_msg *msg; + unsigned int q_len; + unsigned int sz; + int sz_to_copy; + int sz_copied = 0; + int needed; + char *crs = m->msg_iov->iov_base; + unsigned char *buf_crs; + u32 err; + int res; + + /* Currently doesn't support receiving into multiple iovec entries */ + + if (m->msg_iovlen != 1) + return -EOPNOTSUPP; + + /* Catch invalid receive attempts */ + + if (unlikely(!buf_len)) + return -EINVAL; + + if (unlikely(sock->state == SS_DISCONNECTING)) { + if (skb_queue_len(&sock->sk->sk_receive_queue) == 0) + return -ENOTCONN; + } else if (unlikely(sock->state != SS_CONNECTED)) + return -ENOTCONN; + + /* Look for a message in receive queue; wait if necessary */ + + if (unlikely(down_interruptible(&tsock->sem))) + return -ERESTARTSYS; + +restart: + if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) && + (flags & MSG_DONTWAIT))) { + res = (sz_copied == 0) ? 
-EWOULDBLOCK : 0; + goto exit; + } + + if ((res = wait_event_interruptible( + *sock->sk->sk_sleep, + ((q_len = skb_queue_len(&sock->sk->sk_receive_queue)) || + (sock->state == SS_DISCONNECTING))) )) { + goto exit; + } + + /* Catch attempt to receive on an already terminated connection */ + /* [THIS CHECK MAY OVERLAP WITH AN EARLIER CHECK] */ + + if (!q_len) { + res = -ENOTCONN; + goto exit; + } + + /* Get access to first message in receive queue */ + + buf = skb_peek(&sock->sk->sk_receive_queue); + msg = buf_msg(buf); + sz = msg_data_sz(msg); + err = msg_errcode(msg); + + /* Discard an empty non-errored message & try again */ + + if ((!sz) && (!err)) { + advance_queue(tsock); + goto restart; + } + + /* Optionally capture sender's address & ancillary data of first msg */ + + if (sz_copied == 0) { + set_orig_addr(m, msg); + if ((res = anc_data_recv(m, msg, tsock->p))) + goto exit; + } + + /* Capture message data (if valid) & compute return value (always) */ + + if (!err) { + buf_crs = (unsigned char *)(TIPC_SKB_CB(buf)->handle); + sz = buf->tail - buf_crs; + + needed = (buf_len - sz_copied); + sz_to_copy = (sz <= needed) ? sz : needed; + if (unlikely(copy_to_user(crs, buf_crs, sz_to_copy))) { + res = -EFAULT; + goto exit; + } + sz_copied += sz_to_copy; + + if (sz_to_copy < sz) { + if (!(flags & MSG_PEEK)) + TIPC_SKB_CB(buf)->handle = buf_crs + sz_to_copy; + goto exit; + } + + crs += sz_to_copy; + } else { + if (sz_copied != 0) + goto exit; /* can't add error msg to valid data */ + + if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control) + res = 0; + else + res = -ECONNRESET; + } + + /* Consume received message (optional) */ + + if (likely(!(flags & MSG_PEEK))) { + if (unlikely(++tsock->p->conn_unacked >= TIPC_FLOW_CONTROL_WIN)) + tipc_acknowledge(tsock->p->ref, tsock->p->conn_unacked); + advance_queue(tsock); + } + + /* Loop around if more data is required */ + + if ((sz_copied < buf_len) /* didn't get all requested data */ + && (flags & MSG_WAITALL) /* ... 
and need to wait for more */ + && (!(flags & MSG_PEEK)) /* ... and aren't just peeking at data */ + && (!err) /* ... and haven't reached a FIN */ + ) + goto restart; + +exit: + up(&tsock->sem); + return res ? res : sz_copied; +} + +/** + * queue_overloaded - test if queue overload condition exists + * @queue_size: current size of queue + * @base: nominal maximum size of queue + * @msg: message to be added to queue + * + * Returns 1 if queue is currently overloaded, 0 otherwise + */ + +static int queue_overloaded(u32 queue_size, u32 base, struct tipc_msg *msg) +{ + u32 threshold; + u32 imp = msg_importance(msg); + + if (imp == TIPC_LOW_IMPORTANCE) + threshold = base; + else if (imp == TIPC_MEDIUM_IMPORTANCE) + threshold = base * 2; + else if (imp == TIPC_HIGH_IMPORTANCE) + threshold = base * 100; + else + return 0; + + if (msg_connected(msg)) + threshold *= 4; + + return (queue_size > threshold); +} + +/** + * async_disconnect - wrapper function used to disconnect port + * @portref: TIPC port reference (passed as pointer-sized value) + */ + +static void async_disconnect(unsigned long portref) +{ + tipc_disconnect((u32)portref); +} + +/** + * dispatch - handle arriving message + * @tport: TIPC port that received message + * @buf: message + * + * Called with port locked. Must not take socket lock to avoid deadlock risk. + * + * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + */ + +static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_sock *tsock = (struct tipc_sock *)tport->usr_handle; + struct socket *sock; + u32 recv_q_len; + + /* Reject message if socket is closing */ + + if (!tsock) + return TIPC_ERR_NO_PORT; + + /* Reject message if it is wrong sort of message for socket */ + + /* + * WOULD IT BE BETTER TO JUST DISCARD THESE MESSAGES INSTEAD? 
+ * "NO PORT" ISN'T REALLY THE RIGHT ERROR CODE, AND THERE MAY + * BE SECURITY IMPLICATIONS INHERENT IN REJECTING INVALID TRAFFIC + */ + sock = tsock->sk.sk_socket; + if (sock->state == SS_READY) { + if (msg_connected(msg)) { + msg_dbg(msg, "dispatch filter 1\n"); + return TIPC_ERR_NO_PORT; + } + } else { + if (msg_mcast(msg)) { + msg_dbg(msg, "dispatch filter 2\n"); + return TIPC_ERR_NO_PORT; + } + if (sock->state == SS_CONNECTED) { + if (!msg_connected(msg)) { + msg_dbg(msg, "dispatch filter 3\n"); + return TIPC_ERR_NO_PORT; + } + } + else if (sock->state == SS_CONNECTING) { + if (!msg_connected(msg) && (msg_errcode(msg) == 0)) { + msg_dbg(msg, "dispatch filter 4\n"); + return TIPC_ERR_NO_PORT; + } + } + else if (sock->state == SS_LISTENING) { + if (msg_connected(msg) || msg_errcode(msg)) { + msg_dbg(msg, "dispatch filter 5\n"); + return TIPC_ERR_NO_PORT; + } + } + else if (sock->state == SS_DISCONNECTING) { + msg_dbg(msg, "dispatch filter 6\n"); + return TIPC_ERR_NO_PORT; + } + else /* (sock->state == SS_UNCONNECTED) */ { + if (msg_connected(msg) || msg_errcode(msg)) { + msg_dbg(msg, "dispatch filter 7\n"); + return TIPC_ERR_NO_PORT; + } + } + } + + /* Reject message if there isn't room to queue it */ + + if (unlikely((u32)atomic_read(&tipc_queue_size) > + OVERLOAD_LIMIT_BASE)) { + if (queue_overloaded(atomic_read(&tipc_queue_size), + OVERLOAD_LIMIT_BASE, msg)) + return TIPC_ERR_OVERLOAD; + } + recv_q_len = skb_queue_len(&tsock->sk.sk_receive_queue); + if (unlikely(recv_q_len > (OVERLOAD_LIMIT_BASE / 2))) { + if (queue_overloaded(recv_q_len, + OVERLOAD_LIMIT_BASE / 2, msg)) + return TIPC_ERR_OVERLOAD; + } + + /* Initiate connection termination for an incoming 'FIN' */ + + if (unlikely(msg_errcode(msg) && (sock->state == SS_CONNECTED))) { + sock->state = SS_DISCONNECTING; + /* Note: Use signal since port lock is already taken! */ + k_signal((Handler)async_disconnect, tport->ref); + } + + /* Enqueue message (finally!) 
*/ + + msg_dbg(msg,"<DISP<: "); + TIPC_SKB_CB(buf)->handle = msg_data(msg); + atomic_inc(&tipc_queue_size); + skb_queue_tail(&sock->sk->sk_receive_queue, buf); + + wake_up_interruptible(sock->sk->sk_sleep); + return TIPC_OK; +} + +/** + * wakeupdispatch - wake up port after congestion + * @tport: port to wakeup + * + * Called with port lock on. + */ + +static void wakeupdispatch(struct tipc_port *tport) +{ + struct tipc_sock *tsock = (struct tipc_sock *)tport->usr_handle; + + wake_up_interruptible(tsock->sk.sk_sleep); +} + +/** + * connect - establish a connection to another TIPC port + * @sock: socket structure + * @dest: socket address for destination port + * @destlen: size of socket address data structure + * @flags: (unused) + * + * Returns 0 on success, errno otherwise + */ + +static int connect(struct socket *sock, struct sockaddr *dest, int destlen, + int flags) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; + struct msghdr m = {0,}; + struct sk_buff *buf; + struct tipc_msg *msg; + int res; + + /* For now, TIPC does not allow use of connect() with DGRAM or RDM types */ + + if (sock->state == SS_READY) + return -EOPNOTSUPP; + + /* MOVE THE REST OF THIS ERROR CHECKING TO send_msg()? 
*/ + if (sock->state == SS_LISTENING) + return -EOPNOTSUPP; + if (sock->state == SS_CONNECTING) + return -EALREADY; + if (sock->state != SS_UNCONNECTED) + return -EISCONN; + + if ((dst->family != AF_TIPC) || + ((dst->addrtype != TIPC_ADDR_NAME) && (dst->addrtype != TIPC_ADDR_ID))) + return -EINVAL; + + /* Send a 'SYN-' to destination */ + + m.msg_name = dest; + if ((res = send_msg(0, sock, &m, 0)) < 0) { + sock->state = SS_DISCONNECTING; + return res; + } + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + /* Wait for destination's 'ACK' response */ + + res = wait_event_interruptible_timeout(*sock->sk->sk_sleep, + skb_queue_len(&sock->sk->sk_receive_queue), + sock->sk->sk_rcvtimeo); + buf = skb_peek(&sock->sk->sk_receive_queue); + if (res > 0) { + msg = buf_msg(buf); + res = auto_connect(sock, tsock, msg); + if (!res) { + if (dst->addrtype == TIPC_ADDR_NAME) { + tsock->p->conn_type = dst->addr.name.name.type; + tsock->p->conn_instance = dst->addr.name.name.instance; + } + if (!msg_data_sz(msg)) + advance_queue(tsock); + } + } else { + if (res == 0) { + res = -ETIMEDOUT; + } else + { /* leave "res" unchanged */ } + sock->state = SS_DISCONNECTING; + } + + up(&tsock->sem); + return res; +} + +/** + * listen - allow socket to listen for incoming connections + * @sock: socket structure + * @len: (unused) + * + * Returns 0 on success, errno otherwise + */ + +static int listen(struct socket *sock, int len) +{ + /* REQUIRES SOCKET LOCKING OF SOME SORT? 
*/ + + if (sock->state == SS_READY) + return -EOPNOTSUPP; + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + sock->state = SS_LISTENING; + return 0; +} + +/** + * accept - wait for connection request + * @sock: listening socket + * @newsock: new socket that is to be connected + * @flags: file-related flags associated with socket + * + * Returns 0 on success, errno otherwise + */ + +static int accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + struct sk_buff *buf; + int res = -EFAULT; + + if (sock->state == SS_READY) + return -EOPNOTSUPP; + if (sock->state != SS_LISTENING) + return -EINVAL; + + if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) && + (flags & O_NONBLOCK))) + return -EWOULDBLOCK; + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + if (wait_event_interruptible(*sock->sk->sk_sleep, + skb_queue_len(&sock->sk->sk_receive_queue))) { + res = -ERESTARTSYS; + goto exit; + } + buf = skb_peek(&sock->sk->sk_receive_queue); + + res = tipc_create(newsock, 0); + if (!res) { + struct tipc_sock *new_tsock = tipc_sk(newsock->sk); + struct tipc_portid id; + struct tipc_msg *msg = buf_msg(buf); + u32 new_ref = new_tsock->p->ref; + + id.ref = msg_origport(msg); + id.node = msg_orignode(msg); + tipc_connect2port(new_ref, &id); + newsock->state = SS_CONNECTED; + + tipc_set_portimportance(new_ref, msg_importance(msg)); + if (msg_named(msg)) { + new_tsock->p->conn_type = msg_nametype(msg); + new_tsock->p->conn_instance = msg_nameinst(msg); + } + + /* + * Respond to 'SYN-' by discarding it & returning 'ACK'-. + * Respond to 'SYN+' by queuing it on new socket. 
+ */ + + msg_dbg(msg,"<ACC<: "); + if (!msg_data_sz(msg)) { + struct msghdr m = {0,}; + + send_packet(0, newsock, &m, 0); + advance_queue(tsock); + } else { + sock_lock(tsock); + skb_dequeue(&sock->sk->sk_receive_queue); + sock_unlock(tsock); + skb_queue_head(&newsock->sk->sk_receive_queue, buf); + } + } +exit: + up(&tsock->sem); + return res; +} + +/** + * shutdown - shutdown socket connection + * @sock: socket structure + * @how: direction to close (always treated as read + write) + * + * Terminates connection (if necessary), then purges socket's receive queue. + * + * Returns 0 on success, errno otherwise + */ + +static int shutdown(struct socket *sock, int how) +{ + struct tipc_sock* tsock = tipc_sk(sock->sk); + struct sk_buff *buf; + int res; + + /* Could return -EINVAL for an invalid "how", but why bother? */ + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + sock_lock(tsock); + + switch (sock->state) { + case SS_CONNECTED: + + /* Send 'FIN+' or 'FIN-' message to peer */ + + sock_unlock(tsock); +restart: + if ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) { + atomic_dec(&tipc_queue_size); + if (TIPC_SKB_CB(buf)->handle != msg_data(buf_msg(buf))) { + buf_discard(buf); + goto restart; + } + tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN); + } + else { + tipc_shutdown(tsock->p->ref); + } + sock_lock(tsock); + + /* fall through */ + + case SS_DISCONNECTING: + + /* Discard any unreceived messages */ + + while ((buf = skb_dequeue(&sock->sk->sk_receive_queue))) { + atomic_dec(&tipc_queue_size); + buf_discard(buf); + } + tsock->p->conn_unacked = 0; + + /* fall through */ + + case SS_CONNECTING: + sock->state = SS_DISCONNECTING; + res = 0; + break; + + default: + res = -ENOTCONN; + } + + sock_unlock(tsock); + + up(&tsock->sem); + return res; +} + +/** + * setsockopt - set socket option + * @sock: socket structure + * @lvl: option level + * @opt: option identifier + * @ov: pointer to new option value + * @ol: length of option value + * + * For stream 
sockets only, accepts and ignores all IPPROTO_TCP options + * (to ease compatibility). + * + * Returns 0 on success, errno otherwise + */ + +static int setsockopt(struct socket *sock, int lvl, int opt, char *ov, int ol) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + u32 value; + int res; + + if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) + return 0; + if (lvl != SOL_TIPC) + return -ENOPROTOOPT; + if (ol < sizeof(value)) + return -EINVAL; + if ((res = get_user(value, (u32 *)ov))) + return res; + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + switch (opt) { + case TIPC_IMPORTANCE: + res = tipc_set_portimportance(tsock->p->ref, value); + break; + case TIPC_SRC_DROPPABLE: + if (sock->type != SOCK_STREAM) + res = tipc_set_portunreliable(tsock->p->ref, value); + else + res = -ENOPROTOOPT; + break; + case TIPC_DEST_DROPPABLE: + res = tipc_set_portunreturnable(tsock->p->ref, value); + break; + case TIPC_CONN_TIMEOUT: + sock->sk->sk_rcvtimeo = (value * HZ / 1000); + break; + default: + res = -EINVAL; + } + + up(&tsock->sem); + return res; +} + +/** + * getsockopt - get socket option + * @sock: socket structure + * @lvl: option level + * @opt: option identifier + * @ov: receptacle for option value + * @ol: receptacle for length of option value + * + * For stream sockets only, returns 0 length result for all IPPROTO_TCP options + * (to ease compatibility). 
+ * + * Returns 0 on success, errno otherwise + */ + +static int getsockopt(struct socket *sock, int lvl, int opt, char *ov, int *ol) +{ + struct tipc_sock *tsock = tipc_sk(sock->sk); + int len; + u32 value; + int res; + + if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) + return put_user(0, ol); + if (lvl != SOL_TIPC) + return -ENOPROTOOPT; + if ((res = get_user(len, ol))) + return res; + + if (down_interruptible(&tsock->sem)) + return -ERESTARTSYS; + + switch (opt) { + case TIPC_IMPORTANCE: + res = tipc_portimportance(tsock->p->ref, &value); + break; + case TIPC_SRC_DROPPABLE: + res = tipc_portunreliable(tsock->p->ref, &value); + break; + case TIPC_DEST_DROPPABLE: + res = tipc_portunreturnable(tsock->p->ref, &value); + break; + case TIPC_CONN_TIMEOUT: + value = (sock->sk->sk_rcvtimeo * 1000) / HZ; + break; + default: + res = -EINVAL; + } + + if (res) { + /* "get" failed */ + } + else if (len < sizeof(value)) { + res = -EINVAL; + } + else if ((res = copy_to_user(ov, &value, sizeof(value)))) { + /* couldn't return value */ + } + else { + res = put_user(sizeof(value), ol); + } + + up(&tsock->sem); + return res; +} + +/** + * Placeholders for non-implemented functionality + * + * Returns error code (POSIX-compliant where defined) + */ + +static int ioctl(struct socket *s, u32 cmd, unsigned long arg) +{ + return -EINVAL; +} + +static int no_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + return -EINVAL; +} +static ssize_t no_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + return -EINVAL; +} + +static int no_skpair(struct socket *s1, struct socket *s2) +{ + return -EOPNOTSUPP; +} + +/** + * Protocol switches for the various types of TIPC sockets + */ + +static struct proto_ops msg_ops = { + .owner = THIS_MODULE, + .family = AF_TIPC, + .release = release, + .bind = bind, + .connect = connect, + .socketpair = no_skpair, + .accept = accept, + .getname = get_name, + .poll = poll, + .ioctl = 
ioctl,
+	.listen = listen,
+	.shutdown = shutdown,
+	.setsockopt = setsockopt,
+	.getsockopt = getsockopt,
+	.sendmsg = send_msg,
+	.recvmsg = recv_msg,
+	.mmap = no_mmap,
+	.sendpage = no_sendpage
+};
+
+/* Same handlers as msg_ops except .sendmsg, which is send_packet */
+
+static struct proto_ops packet_ops = {
+	.owner = THIS_MODULE,
+	.family = AF_TIPC,
+	.release = release,
+	.bind = bind,
+	.connect = connect,
+	.socketpair = no_skpair,
+	.accept = accept,
+	.getname = get_name,
+	.poll = poll,
+	.ioctl = ioctl,
+	.listen = listen,
+	.shutdown = shutdown,
+	.setsockopt = setsockopt,
+	.getsockopt = getsockopt,
+	.sendmsg = send_packet,
+	.recvmsg = recv_msg,
+	.mmap = no_mmap,
+	.sendpage = no_sendpage
+};
+
+/* Same handlers as msg_ops except .sendmsg/.recvmsg (send_stream/recv_stream) */
+
+static struct proto_ops stream_ops = {
+	.owner = THIS_MODULE,
+	.family = AF_TIPC,
+	.release = release,
+	.bind = bind,
+	.connect = connect,
+	.socketpair = no_skpair,
+	.accept = accept,
+	.getname = get_name,
+	.poll = poll,
+	.ioctl = ioctl,
+	.listen = listen,
+	.shutdown = shutdown,
+	.setsockopt = setsockopt,
+	.getsockopt = getsockopt,
+	.sendmsg = send_stream,
+	.recvmsg = recv_stream,
+	.mmap = no_mmap,
+	.sendpage = no_sendpage
+};
+
+/* Address family descriptor handed to sock_register() by socket_init() */
+
+static struct net_proto_family tipc_family_ops = {
+	.owner = THIS_MODULE,
+	.family = AF_TIPC,
+	.create = tipc_create
+};
+
+/* Protocol descriptor handed to proto_register() by socket_init() */
+
+static struct proto tipc_proto = {
+	.name = "TIPC",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof(struct tipc_sock)
+};
+
+/**
+ * socket_init - initialize TIPC socket interface
+ *
+ * Registers the protocol first and the socket family second; if the second
+ * step fails the protocol registration is undone before returning.
+ * sockets_enabled is set only when both registrations succeeded.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+int socket_init(void)
+{
+	int res;
+
+	res = proto_register(&tipc_proto, 1);
+	if (res) {
+		err("Failed to register TIPC protocol type\n");
+		goto out;
+	}
+
+	res = sock_register(&tipc_family_ops);
+	if (res) {
+		err("Failed to register TIPC socket type\n");
+		proto_unregister(&tipc_proto);
+		goto out;
+	}
+
+	sockets_enabled = 1;
+ out:
+	return res;
+}
+
+/**
+ * socket_stop - stop TIPC socket interface
+ *
+ * Does nothing unless socket_init() previously set sockets_enabled.
+ */
+void socket_stop(void)
+{
+	if (!sockets_enabled)
+		return;
+
+	sockets_enabled = 0;
+	
sock_unregister(tipc_family_ops.family); + proto_unregister(&tipc_proto); +} + diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c new file mode 100644 index 000000000000..80e219ba527d --- /dev/null +++ b/net/tipc/subscr.c @@ -0,0 +1,527 @@ +/* + * net/tipc/subscr.c: TIPC subscription service + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "dbg.h"
+#include "subscr.h"
+#include "name_table.h"
+#include "ref.h"
+
+/**
+ * struct subscriber - TIPC network topology subscriber
+ * @ref: object reference to subscriber object itself
+ * @lock: pointer to spinlock controlling access to subscriber object
+ * @subscriber_list: adjacent subscribers in top. server's list of subscribers
+ * @subscription_list: list of subscription objects for this subscriber
+ * @port_ref: object reference to port used to communicate with subscriber
+ * @swap: indicates if subscriber uses opposite endianness in its messages
+ */
+
+struct subscriber {
+	u32 ref;
+	spinlock_t *lock;	/* filled in by ref_acquire() in subscr_named_msg_event() */
+	struct list_head subscriber_list;
+	struct list_head subscription_list;
+	u32 port_ref;
+	int swap;		/* recomputed by subscr_subscribe() for every request */
+};
+
+/**
+ * struct top_srv - TIPC network topology subscription service
+ * @user_ref: TIPC userid of subscription service
+ * @setup_port: reference to TIPC port that handles subscription requests
+ * @subscription_count: number of active subscriptions (not subscribers!) 
+ * @subscriber_list: list of ports subscribing to service
+ * @lock: spinlock governing access to subscriber list
+ */
+
+struct top_srv {
+	u32 user_ref;
+	u32 setup_port;
+	atomic_t subscription_count;
+	struct list_head subscriber_list;
+	spinlock_t lock;
+};
+
+static struct top_srv topsrv = { 0 };
+
+/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * The reversal is done with shifts on the 32-bit value itself, so the result
+ * does not depend on the byte order of the machine running this code.
+ *
+ * Returns converted value
+ */
+
+static inline u32 htohl(u32 in, int swap)
+{
+	if (!swap)
+		return in;
+
+	/* Move each byte to the mirrored position: 0<->3 and 1<->2 */
+	return ((in & 0x000000ffu) << 24) |
+	       ((in & 0x0000ff00u) << 8) |
+	       ((in & 0x00ff0000u) >> 8) |
+	       ((in & 0xff000000u) >> 24);
+}
+
+/**
+ * subscr_send_event - send a message containing a tipc_event to the subscriber
+ * @sub: subscription whose event template (sub->evt) is filled in and sent
+ * @found_lower: lower bound to report
+ * @found_upper: upper bound to report
+ * @event: event code to report
+ * @port_ref: port reference to report
+ * @node: node address to report
+ *
+ * Every field is converted with htohl() according to the owner's swap flag
+ * before the event is sent on the owner's port.
+ */
+
+static void subscr_send_event(struct subscription *sub,
+			      u32 found_lower,
+			      u32 found_upper,
+			      u32 event,
+			      u32 port_ref,
+			      u32 node)
+{
+	struct iovec msg_sect;
+
+	msg_sect.iov_base = (void *)&sub->evt;
+	msg_sect.iov_len = sizeof(struct tipc_event);
+
+	sub->evt.event = htohl(event, sub->owner->swap);
+	sub->evt.found_lower = htohl(found_lower, sub->owner->swap);
+	sub->evt.found_upper = htohl(found_upper, sub->owner->swap);
+	sub->evt.port.ref = htohl(port_ref, sub->owner->swap);
+	sub->evt.port.node = htohl(node, sub->owner->swap);
+	tipc_send(sub->owner->port_ref, 1, &msg_sect);
+}
+
+/**
+ * subscr_overlap - test for subscription overlap with the given values
+ *
+ * Returns 1 if there is overlap, otherwise 0. 
+ */
+
+int subscr_overlap(struct subscription *sub,
+		   u32 found_lower,
+		   u32 found_upper)
+
+{
+	/* Clamp the reported range to the subscribed range ... */
+	if (found_lower < sub->seq.lower)
+		found_lower = sub->seq.lower;
+	if (found_upper > sub->seq.upper)
+		found_upper = sub->seq.upper;
+	/* ... an inverted result means the two ranges do not intersect */
+	if (found_lower > found_upper)
+		return 0;
+	return 1;
+}
+
+/**
+ * subscr_report_overlap - issue event if there is subscription overlap
+ * @must: if zero, the event is sent only when the subscription's filter
+ *        is TIPC_SUB_PORTS
+ *
+ * Nothing is sent when the reported range does not overlap the subscription.
+ *
+ * Protected by nameseq.lock in name_table.c
+ */
+
+void subscr_report_overlap(struct subscription *sub,
+			   u32 found_lower,
+			   u32 found_upper,
+			   u32 event,
+			   u32 port_ref,
+			   u32 node,
+			   int must)
+{
+	dbg("Rep overlap %u:%u,%u<->%u,%u\n", sub->seq.type, sub->seq.lower,
+	    sub->seq.upper, found_lower, found_upper);
+	if (!subscr_overlap(sub, found_lower, found_upper))
+		return;
+	if (!must && (sub->filter != TIPC_SUB_PORTS))
+		return;
+	subscr_send_event(sub, found_lower, found_upper, event, port_ref, node);
+}
+
+/**
+ * subscr_timeout - subscription timeout has occurred
+ * @sub: subscription whose timer fired (installed by subscr_subscribe())
+ *
+ * Sends a TIPC_SUBSCR_TIMEOUT event for the originally requested range, then
+ * unlinks and frees the subscription. Does nothing if the owning subscriber's
+ * reference can no longer be locked.
+ */
+
+static void subscr_timeout(struct subscription *sub)
+{
+	struct subscriber *subscriber;
+	u32 subscriber_ref;
+
+	/* Validate subscriber reference (in case subscriber is terminating) */
+
+	subscriber_ref = sub->owner->ref;
+	subscriber = (struct subscriber *)ref_lock(subscriber_ref);
+	if (subscriber == NULL)
+		return;
+
+	/* Unlink subscription from name table */
+
+	nametbl_unsubscribe(sub);
+
+	/* Notify subscriber of timeout, then unlink subscription */
+
+	subscr_send_event(sub,
+			  sub->evt.s.seq.lower,
+			  sub->evt.s.seq.upper,
+			  TIPC_SUBSCR_TIMEOUT,
+			  0,
+			  0);
+	list_del(&sub->subscription_list);
+
+	/* Now destroy subscription */
+
+	ref_unlock(subscriber_ref);
+	k_term_timer(&sub->timer);
+	kfree(sub);
+	atomic_dec(&topsrv.subscription_count);
+}
+
+/**
+ * subscr_terminate - terminate communication with a subscriber
+ *
+ * Called with subscriber locked. 
Routine must temporarily release this lock
+ * to enable subscription timeout routine(s) to finish without deadlocking;
+ * the lock is then reclaimed to allow caller to release it upon return.
+ * (This should work even in the unlikely event some other thread creates
+ * a new object reference in the interim that uses this lock; this routine will
+ * simply wait for it to be released, then claim it.)
+ *
+ * The subscriber object is freed here, so a caller that must unlock after
+ * return has to save subscriber->lock before calling.
+ */
+
+static void subscr_terminate(struct subscriber *subscriber)
+{
+	struct subscription *sub;
+	struct subscription *sub_temp;
+
+	/* Invalidate subscriber reference */
+
+	ref_discard(subscriber->ref);
+	spin_unlock_bh(subscriber->lock);
+
+	/* Destroy any existing subscriptions for subscriber */
+
+	list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+				 subscription_list) {
+		/* a timer exists only for subscriptions with a finite timeout */
+		if (sub->timeout != TIPC_WAIT_FOREVER) {
+			k_cancel_timer(&sub->timer);
+			k_term_timer(&sub->timer);
+		}
+		nametbl_unsubscribe(sub);
+		list_del(&sub->subscription_list);
+		/* subscriber is a pointer, so it is printed with %p rather than %x */
+		dbg("Term: Removed sub %u,%u,%u from subscriber %p list\n",
+		    sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber);
+		kfree(sub);
+		atomic_dec(&topsrv.subscription_count);
+	}
+
+	/* Sever connection to subscriber */
+
+	tipc_shutdown(subscriber->port_ref);
+	tipc_deleteport(subscriber->port_ref);
+
+	/* Remove subscriber from topology server's subscriber list */
+
+	spin_lock_bh(&topsrv.lock);
+	list_del(&subscriber->subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/* Now destroy subscriber */
+
+	spin_lock_bh(subscriber->lock);
+	kfree(subscriber);
+}
+
+/**
+ * subscr_subscribe - create subscription for subscriber
+ *
+ * Called with subscriber locked
+ */
+
+static void subscr_subscribe(struct tipc_subscr *s,
+			     struct subscriber *subscriber)
+{
+	struct subscription *sub;
+
+	/* Refuse subscription if global limit exceeded */
+
+	if (atomic_read(&topsrv.subscription_count) >= tipc_max_subscriptions) {
+		warn("Failed: max %u subscriptions\n", tipc_max_subscriptions);
+		
subscr_terminate(subscriber);
+		return;
+	}
+
+	/* Allocate subscription object */
+
+	sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
+	if (sub == NULL) {
+		warn("Memory squeeze; ignoring subscription\n");
+		subscr_terminate(subscriber);
+		return;
+	}
+
+	/*
+	 * Determine/update subscriber's endianness:
+	 * a filter that already equals a known value as received needs no
+	 * swapping; anything else is assumed to be in the opposite byte order
+	 * and is validated again after conversion below.
+	 */
+
+	if ((s->filter == TIPC_SUB_PORTS) || (s->filter == TIPC_SUB_SERVICE))
+		subscriber->swap = 0;
+	else
+		subscriber->swap = 1;
+
+	/* Initialize subscription object */
+
+	memset(sub, 0, sizeof(*sub));
+	sub->seq.type = htohl(s->seq.type, subscriber->swap);
+	sub->seq.lower = htohl(s->seq.lower, subscriber->swap);
+	sub->seq.upper = htohl(s->seq.upper, subscriber->swap);
+	sub->timeout = htohl(s->timeout, subscriber->swap);
+	sub->filter = htohl(s->filter, subscriber->swap);
+	if ((((sub->filter != TIPC_SUB_PORTS)
+	      && (sub->filter != TIPC_SUB_SERVICE)))
+	    || (sub->seq.lower > sub->seq.upper)) {
+		warn("Rejecting illegal subscription %u,%u,%u\n",
+		     sub->seq.type, sub->seq.lower, sub->seq.upper);
+		kfree(sub);
+		subscr_terminate(subscriber);
+		return;
+	}
+	/* the request is kept as received; subscr_timeout() reads its range back */
+	memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
+	INIT_LIST_HEAD(&sub->subscription_list);
+	INIT_LIST_HEAD(&sub->nameseq_list);
+	list_add(&sub->subscription_list, &subscriber->subscription_list);
+	atomic_inc(&topsrv.subscription_count);
+	if (sub->timeout != TIPC_WAIT_FOREVER) {
+		k_init_timer(&sub->timer,
+			     (Handler)subscr_timeout, (unsigned long)sub);
+		k_start_timer(&sub->timer, sub->timeout);
+	}
+	sub->owner = subscriber;
+	nametbl_subscribe(sub);
+}
+
+/**
+ * subscr_conn_shutdown_event - handle termination request from subscriber
+ * @usr_handle: carries the subscriber's object reference (cast to u32)
+ *
+ * The remaining parameters are not used by this handler.
+ */
+
+static void subscr_conn_shutdown_event(void *usr_handle,
+				       u32 portref,
+				       struct sk_buff **buf,
+				       unsigned char const *data,
+				       unsigned int size,
+				       int reason)
+{
+	struct subscriber *subscriber;
+	spinlock_t *subscriber_lock;
+
+	subscriber = ref_lock((u32)(unsigned long)usr_handle);
+	if (subscriber == NULL)
+		return;
+
+	/* saved now because subscr_terminate() frees the subscriber object */
+	subscriber_lock = subscriber->lock;
+	
subscr_terminate(subscriber);
+	spin_unlock_bh(subscriber_lock);
+}
+
+/**
+ * subscr_conn_msg_event - handle new subscription request from subscriber
+ * @usr_handle: carries the subscriber's object reference (cast to u32)
+ * @data: received message, treated as a struct tipc_subscr
+ * @size: length of @data; any other length than sizeof(struct tipc_subscr)
+ *        terminates the subscriber
+ */
+
+static void subscr_conn_msg_event(void *usr_handle,
+				  u32 port_ref,
+				  struct sk_buff **buf,
+				  const unchar *data,
+				  u32 size)
+{
+	struct subscriber *subscriber;
+	spinlock_t *subscriber_lock;
+
+	subscriber = ref_lock((u32)(unsigned long)usr_handle);
+	if (subscriber == NULL)
+		return;
+
+	/* saved now because either branch below may free the subscriber */
+	subscriber_lock = subscriber->lock;
+	if (size != sizeof(struct tipc_subscr))
+		subscr_terminate(subscriber);
+	else
+		subscr_subscribe((struct tipc_subscr *)data, subscriber);
+
+	spin_unlock_bh(subscriber_lock);
+}
+
+/**
+ * subscr_named_msg_event - handle request to establish a new subscriber
+ * @data: optional initial subscription request
+ * @size: 0 (no request included) or sizeof(struct tipc_subscr)
+ * @importance: importance level passed on to the new connection port
+ * @orig: port id of the requester; the new port is connected to it
+ */
+
+static void subscr_named_msg_event(void *usr_handle,
+				   u32 port_ref,
+				   struct sk_buff **buf,
+				   const unchar *data,
+				   u32 size,
+				   u32 importance,
+				   struct tipc_portid const *orig,
+				   struct tipc_name_seq const *dest)
+{
+	struct subscriber *subscriber;
+	struct iovec msg_sect = {0, 0};
+	spinlock_t *subscriber_lock;
+
+	dbg("subscr_named_msg_event: orig = %x own = %x,\n",
+	    orig->node, tipc_own_addr);
+	if (size && (size != sizeof(struct tipc_subscr))) {
+		warn("Received tipc_subscr of invalid size\n");
+		return;
+	}
+
+	/* Create subscriber object */
+
+	subscriber = kmalloc(sizeof(struct subscriber), GFP_ATOMIC);
+	if (subscriber == NULL) {
+		warn("Memory squeeze; ignoring subscriber setup\n");
+		return;
+	}
+	memset(subscriber, 0, sizeof(struct subscriber));
+	INIT_LIST_HEAD(&subscriber->subscription_list);
+	INIT_LIST_HEAD(&subscriber->subscriber_list);
+	subscriber->ref = ref_acquire(subscriber, &subscriber->lock);
+	if (subscriber->ref == 0) {
+		warn("Failed to acquire subscriber reference\n");
+		kfree(subscriber);
+		return;
+	}
+
+	/*
+	 * Establish a connection to subscriber; the return value is not
+	 * examined, failure is detected through port_ref still being 0
+	 * from the memset() above.
+	 */
+
+	tipc_createport(topsrv.user_ref,
+			(void *)(unsigned long)subscriber->ref,
+			importance,
+			0,
+			0,
+			subscr_conn_shutdown_event,
+			0,
+			0,
+			
subscr_conn_msg_event,
+			0,
+			&subscriber->port_ref);
+	if (subscriber->port_ref == 0) {
+		warn("Memory squeeze; failed to create subscription port\n");
+		ref_discard(subscriber->ref);
+		kfree(subscriber);
+		return;
+	}
+	tipc_connect2port(subscriber->port_ref, orig);
+
+
+	/* Add subscriber to topology server's subscriber list */
+
+	ref_lock(subscriber->ref);
+	spin_lock_bh(&topsrv.lock);
+	list_add(&subscriber->subscriber_list, &topsrv.subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/*
+	 * Subscribe now if message contains a subscription,
+	 * otherwise send an empty response to complete connection handshaking
+	 */
+
+	/* saved now because subscr_subscribe() may free the subscriber */
+	subscriber_lock = subscriber->lock;
+	if (size)
+		subscr_subscribe((struct tipc_subscr *)data, subscriber);
+	else
+		tipc_send(subscriber->port_ref, 1, &msg_sect);
+
+	spin_unlock_bh(subscriber_lock);
+}
+
+/**
+ * subscr_start - start the topology subscription service
+ *
+ * Resets topsrv, attaches as a TIPC user, creates the setup port whose
+ * named-message handler is subscr_named_msg_event(), and publishes that
+ * port under the TIPC_TOP_SRV name with node scope.
+ *
+ * Returns 0 on success; otherwise the error from the failing step, after
+ * detaching the user again if the attach itself had succeeded.
+ */
+int subscr_start(void)
+{
+	struct tipc_name_seq seq = {TIPC_TOP_SRV, TIPC_TOP_SRV, TIPC_TOP_SRV};
+	int res = -1;
+
+	memset(&topsrv, 0, sizeof (topsrv));
+	topsrv.lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&topsrv.subscriber_list);
+
+	spin_lock_bh(&topsrv.lock);
+	res = tipc_attach(&topsrv.user_ref, 0, 0);
+	if (res) {
+		spin_unlock_bh(&topsrv.lock);
+		return res;
+	}
+
+	res = tipc_createport(topsrv.user_ref,
+			      0,
+			      TIPC_CRITICAL_IMPORTANCE,
+			      0,
+			      0,
+			      0,
+			      0,
+			      subscr_named_msg_event,
+			      0,
+			      0,
+			      &topsrv.setup_port);
+	if (res)
+		goto failed;
+
+	res = nametbl_publish_rsv(topsrv.setup_port, TIPC_NODE_SCOPE, &seq);
+	if (res)
+		goto failed;
+
+	spin_unlock_bh(&topsrv.lock);
+	return 0;
+
+failed:
+	err("Failed to create subscription service\n");
+	tipc_detach(topsrv.user_ref);
+	topsrv.user_ref = 0;
+	spin_unlock_bh(&topsrv.lock);
+	return res;
+}
+
+/**
+ * subscr_stop - stop the topology subscription service
+ *
+ * Does nothing unless subscr_start() left a non-zero topsrv.user_ref.
+ * Otherwise deletes the setup port, terminates every subscriber on the
+ * server's list and detaches the TIPC user.
+ */
+void subscr_stop(void)
+{
+	struct subscriber *subscriber;
+	struct subscriber *subscriber_temp;
+	spinlock_t *subscriber_lock;
+
+	if (topsrv.user_ref) {
+		tipc_deleteport(topsrv.setup_port);
+		list_for_each_entry_safe(subscriber, subscriber_temp,
+					 &topsrv.subscriber_list,
+					 
subscriber_list) {
+			ref_lock(subscriber->ref);
+			/* saved now because subscr_terminate() frees the subscriber */
+			subscriber_lock = subscriber->lock;
+			subscr_terminate(subscriber);
+			spin_unlock_bh(subscriber_lock);
+		}
+		tipc_detach(topsrv.user_ref);
+		topsrv.user_ref = 0;
+	}
+}
+
+
+/**
+ * tipc_ispublished - check whether a name is currently published
+ * @name: name (type and instance) to look up
+ *
+ * The lookup is done through nametbl_translate() with a domain of 0.
+ *
+ * Returns 1 if the lookup yields a non-zero result, otherwise 0
+ */
+int tipc_ispublished(struct tipc_name const *name)
+{
+	u32 domain = 0;
+
+	return(nametbl_translate(name->type, name->instance,&domain) != 0);
+}
+
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
new file mode 100644
index 000000000000..ccff4efcb755
--- /dev/null
+++ b/net/tipc/subscr.h
@@ -0,0 +1,80 @@
+/*
+ * net/tipc/subscr.h: Include file for TIPC subscription service
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SUBSCR_H
+#define _TIPC_SUBSCR_H
+
+/**
+ * struct subscription - TIPC network topology subscription object
+ * @seq: name sequence associated with subscription
+ * @timeout: duration of subscription (in ms)
+ * @filter: event filtering to be done for subscription
+ * @evt: template for events generated by subscription
+ * @subscription_list: adjacent subscriptions in subscriber's subscription list
+ * @nameseq_list: adjacent subscriptions in name sequence's subscription list
+ * @timer: timer governing subscription duration (used only when @timeout
+ *         is not TIPC_WAIT_FOREVER)
+ * @owner: pointer to subscriber object associated with this subscription
+ */
+
+struct subscription {
+	struct tipc_name_seq seq;
+	u32 timeout;
+	u32 filter;
+	struct tipc_event evt;
+	struct list_head subscription_list;
+	struct list_head nameseq_list;
+	struct timer_list timer;
+	struct subscriber *owner;
+};
+
+int subscr_overlap(struct subscription * sub,
+		   u32 found_lower,
+		   u32 found_upper);
+
+void subscr_report_overlap(struct subscription * sub,
+			   u32 found_lower,
+			   u32 found_upper,
+			   u32 event,
+			   u32 port_ref,
+			   u32 node,
+			   int must_report);
+
+int subscr_start(void);
+
+void subscr_stop(void);
+
+
+#endif
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
new file mode 100644
index 000000000000..35ec7dc8211d
--- /dev/null
+++ b/net/tipc/user_reg.c
@@ -0,0 +1,265 @@
+/*
+ * net/tipc/user_reg.c: TIPC user registry code
+ *
+ * Copyright (c) 
2000-2006, Ericsson AB + * Copyright (c) 2004-2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "user_reg.h" + +/* + * TIPC user registry keeps track of users of the tipc_port interface. 
+ *
+ * The registry utilizes an array of "TIPC user" entries;
+ * a user's ID is the index of their associated array entry.
+ * Array entry 0 is not used, so userid 0 is not valid;
+ * TIPC sometimes uses this value to denote an anonymous user.
+ * The list of free entries is initially chained from last entry to entry 1.
+ */
+
+/**
+ * struct tipc_user - registered TIPC user info
+ * @next: index of next free registry entry (or -1 for an allocated entry)
+ * @callback: ptr to routine to call when TIPC mode changes (NULL if none)
+ * @usr_handle: user-defined value passed to callback routine
+ * @ports: list of user ports owned by the user
+ */
+
+struct tipc_user {
+	int next;
+	tipc_mode_event callback;
+	void *usr_handle;
+	struct list_head ports;
+};
+
+#define MAX_USERID 64
+#define USER_LIST_SIZE ((MAX_USERID + 1) * sizeof(struct tipc_user))
+
+static struct tipc_user *users = 0;
+static u32 next_free_user = MAX_USERID + 1;
+static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED;
+
+/**
+ * reg_init - create TIPC user registry (but don't activate it)
+ *
+ * If registry has been pre-initialized it is left "as is".
+ * NOTE: This routine may be called when TIPC is inactive.
+ *
+ * Returns TIPC_OK if the registry exists on return, -ENOMEM otherwise
+ */
+
+static int reg_init(void)
+{
+	u32 i;
+
+	spin_lock_bh(&reg_lock);
+	if (!users) {
+		users = (struct tipc_user *)kmalloc(USER_LIST_SIZE, GFP_ATOMIC);
+		if (users) {
+			memset(users, 0, USER_LIST_SIZE);
+			/* free list: MAX_USERID -> ... -> 1 -> 0 (0 ends the chain) */
+			for (i = 1; i <= MAX_USERID; i++) {
+				users[i].next = i - 1;
+			}
+			next_free_user = MAX_USERID;
+		}
+	}
+	spin_unlock_bh(&reg_lock);
+	return users ? 
TIPC_OK : -ENOMEM;
+}
+
+/**
+ * reg_callback - inform TIPC user about current operating mode
+ * @user_ptr: registry entry of the user to notify
+ *
+ * The callback and its argument are sampled under reg_lock, but the callback
+ * itself is invoked after the lock has been dropped.
+ */
+
+static void reg_callback(struct tipc_user *user_ptr)
+{
+	tipc_mode_event cb;
+	void *arg;
+
+	spin_lock_bh(&reg_lock);
+	cb = user_ptr->callback;
+	arg = user_ptr->usr_handle;
+	spin_unlock_bh(&reg_lock);
+
+	if (cb)
+		cb(arg, tipc_mode, tipc_own_addr);
+}
+
+/**
+ * reg_start - activate TIPC user registry
+ *
+ * Creates the registry if necessary, then schedules reg_callback() through
+ * k_signal() for every entry that has a callback.
+ *
+ * Returns TIPC_OK on success, or the error from reg_init()
+ */
+
+int reg_start(void)
+{
+	u32 u;
+	int res;
+
+	if ((res = reg_init()))
+		return res;
+
+	for (u = 1; u <= MAX_USERID; u++) {
+		if (users[u].callback)
+			k_signal((Handler)reg_callback,
+				 (unsigned long)&users[u]);
+	}
+	return TIPC_OK;
+}
+
+/**
+ * reg_stop - shut down & delete TIPC user registry
+ *
+ * Calls reg_callback() directly for every entry that has a callback,
+ * then frees the registry.
+ */
+
+void reg_stop(void)
+{
+	int id;
+
+	if (!users)
+		return;
+
+	for (id = 1; id <= MAX_USERID; id++) {
+		if (users[id].callback)
+			reg_callback(&users[id]);
+	}
+	kfree(users);
+	users = 0;
+}
+
+/**
+ * tipc_attach - register a TIPC user
+ *
+ * NOTE: This routine may be called when TIPC is inactive. 
+ */
+
+int tipc_attach(u32 *userid, tipc_mode_event cb, void *usr_handle)
+{
+	struct tipc_user *user_ptr;
+	int res;
+
+	if ((tipc_mode == TIPC_NOT_RUNNING) && !cb)
+		return -ENOPROTOOPT;
+	if (!users) {
+		/* without a registry the users[] access below would hit NULL */
+		res = reg_init();
+		if (res)
+			return res;
+	}
+
+	spin_lock_bh(&reg_lock);
+	if (!next_free_user) {
+		spin_unlock_bh(&reg_lock);
+		return -EBUSY;
+	}
+	/* pop the head of the free list; next == -1 marks the entry as in use */
+	user_ptr = &users[next_free_user];
+	*userid = next_free_user;
+	next_free_user = user_ptr->next;
+	user_ptr->next = -1;
+	spin_unlock_bh(&reg_lock);
+
+	user_ptr->callback = cb;
+	user_ptr->usr_handle = usr_handle;
+	INIT_LIST_HEAD(&user_ptr->ports);
+	atomic_inc(&tipc_user_count);
+
+	if (cb && (tipc_mode != TIPC_NOT_RUNNING))
+		k_signal((Handler)reg_callback, (unsigned long)user_ptr);
+	return TIPC_OK;
+}
+
+/**
+ * tipc_detach - deregister a TIPC user
+ * @userid: id handed out by tipc_attach(); out-of-range ids and entries
+ *          that are not currently allocated are ignored
+ *
+ * The entry goes back onto the free list and every port still owned by the
+ * user is deleted after reg_lock has been released.
+ */
+
+void tipc_detach(u32 userid)
+{
+	struct tipc_user *user_ptr;
+	struct list_head ports_temp;
+	struct user_port *up_ptr, *temp_up_ptr;
+
+	if ((userid == 0) || (userid > MAX_USERID))
+		return;
+
+	spin_lock_bh(&reg_lock);
+	if ((!users) || (users[userid].next >= 0)) {
+		spin_unlock_bh(&reg_lock);
+		return;
+	}
+
+	user_ptr = &users[userid];
+	user_ptr->callback = NULL;
+	/* move the ports to a private list so they can be deleted unlocked */
+	INIT_LIST_HEAD(&ports_temp);
+	list_splice(&user_ptr->ports, &ports_temp);
+	user_ptr->next = next_free_user;
+	next_free_user = userid;
+	spin_unlock_bh(&reg_lock);
+
+	atomic_dec(&tipc_user_count);
+
+	list_for_each_entry_safe(up_ptr, temp_up_ptr, &ports_temp, uport_list) {
+		tipc_deleteport(up_ptr->ref);
+	}
+}
+
+/**
+ * reg_add_port - register a user's driver port
+ * @up_ptr: port to add to the port list of the user in up_ptr->user_ref
+ *
+ * Returns TIPC_OK (also for user_ref 0, which is not tracked), -EINVAL for
+ * an out-of-range user_ref, or -ENOPROTOOPT if TIPC is not running or no
+ * registry exists
+ */
+
+int reg_add_port(struct user_port *up_ptr)
+{
+	struct tipc_user *user_ptr;
+
+	if (up_ptr->user_ref == 0)
+		return TIPC_OK;
+	if (up_ptr->user_ref > MAX_USERID)
+		return -EINVAL;
+	if ((tipc_mode == TIPC_NOT_RUNNING) || !users )
+		return -ENOPROTOOPT;
+
+	spin_lock_bh(&reg_lock);
+	user_ptr = &users[up_ptr->user_ref];
+	list_add(&up_ptr->uport_list, &user_ptr->ports);
+	spin_unlock_bh(&reg_lock);
+	return TIPC_OK;
+}
+
+/**
+ * reg_remove_port - deregister a user's driver port
+ */
+
+int reg_remove_port(struct user_port *up_ptr)
+{
+	/* ports of the anonymous user (user_ref 0) are not tracked */
+	if (up_ptr->user_ref == 0)
+		return TIPC_OK;
+	if (up_ptr->user_ref > MAX_USERID)
+		return -EINVAL;
+	if (!users )
+		return -ENOPROTOOPT;
+
+	spin_lock_bh(&reg_lock);
+	list_del_init(&up_ptr->uport_list);
+	spin_unlock_bh(&reg_lock);
+	return TIPC_OK;
+}
+
diff --git a/net/tipc/user_reg.h b/net/tipc/user_reg.h
new file mode 100644
index 000000000000..122ca9be3671
--- /dev/null
+++ b/net/tipc/user_reg.h
@@ -0,0 +1,48 @@
+/*
+ * net/tipc/user_reg.h: Include file for TIPC user registry code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_USER_REG_H +#define _TIPC_USER_REG_H + +#include "port.h" + +int reg_start(void); +void reg_stop(void); + +int reg_add_port(struct user_port *up_ptr); +int reg_remove_port(struct user_port *up_ptr); + +#endif diff --git a/net/tipc/zone.c b/net/tipc/zone.c new file mode 100644 index 000000000000..4eaef662d568 --- /dev/null +++ b/net/tipc/zone.c @@ -0,0 +1,169 @@ +/* + * net/tipc/zone.c: TIPC zone management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "zone.h"
+#include "net.h"
+#include "addr.h"
+#include "node_subscr.h"
+#include "cluster.h"
+#include "node.h"
+
+/**
+ * zone_create - allocate a zone object and record it in net.zones[]
+ * @addr: address whose zone part selects the zone to create
+ *
+ * Returns the new zeroed zone with its addr set to <zone.0.0>, or 0 if
+ * @addr fails addr_domain_valid() or the allocation fails
+ */
+struct _zone *zone_create(u32 addr)
+{
+	struct _zone *z_ptr = 0;
+	u32 z_num;
+
+	if (!addr_domain_valid(addr))
+		return 0;
+
+	z_ptr = (struct _zone *)kmalloc(sizeof(*z_ptr), GFP_ATOMIC);
+	if (z_ptr != NULL) {
+		memset(z_ptr, 0, sizeof(*z_ptr));
+		z_num = tipc_zone(addr);
+		z_ptr->addr = tipc_addr(z_num, 0, 0);
+		net.zones[z_num] = z_ptr;
+	}
+	return z_ptr;
+}
+
+/**
+ * zone_delete - delete a zone and the clusters attached to it
+ * @z_ptr: zone to delete (may be NULL, in which case nothing is done)
+ *
+ * Every cluster slot is handed to cluster_delete(), including empty ones.
+ * NOTE(review): presumably cluster_delete() tolerates a NULL argument;
+ * confirm against cluster.c.
+ */
+void zone_delete(struct _zone *z_ptr)
+{
+	u32 c_num;
+
+	if (!z_ptr)
+		return;
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		cluster_delete(z_ptr->clusters[c_num]);
+	}
+	kfree(z_ptr);
+}
+
+/**
+ * zone_attach_cluster - record a cluster in its zone's cluster table
+ * @z_ptr: zone to attach to
+ * @c_ptr: cluster to attach; its slot is the cluster part of c_ptr->addr
+ *
+ * Asserts that the cluster has an address, that its number is in range and
+ * that the slot is still empty.
+ */
+void zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr)
+{
+	u32 c_num = tipc_cluster(c_ptr->addr);
+
+	assert(c_ptr->addr);
+	assert(c_num <= tipc_max_clusters);
+	assert(z_ptr->clusters[c_num] == 0);
+	z_ptr->clusters[c_num] = c_ptr;
+}
+
+/**
+ * zone_remove_as_router - remove a router from every cluster of a zone
+ * @z_ptr: zone whose clusters are visited (empty slots are skipped)
+ * @router: router address passed on to cluster_remove_as_router()
+ */
+void zone_remove_as_router(struct _zone *z_ptr, u32 router)
+{
+	u32 c_num;
+
+	for (c_num = 1; c_num <= tipc_max_clusters; c_num++) {
+		if (z_ptr->clusters[c_num]) {
+			
cluster_remove_as_router(z_ptr->clusters[c_num], + router); + } + } +} + +void zone_send_external_routes(struct _zone *z_ptr, u32 dest) +{ + u32 c_num; + + for (c_num = 1; c_num <= tipc_max_clusters; c_num++) { + if (z_ptr->clusters[c_num]) { + if (in_own_cluster(z_ptr->addr)) + continue; + cluster_send_ext_routes(z_ptr->clusters[c_num], dest); + } + } +} + +struct node *zone_select_remote_node(struct _zone *z_ptr, u32 addr, u32 ref) +{ + struct cluster *c_ptr; + struct node *n_ptr; + u32 c_num; + + if (!z_ptr) + return 0; + c_ptr = z_ptr->clusters[tipc_cluster(addr)]; + if (!c_ptr) + return 0; + n_ptr = cluster_select_node(c_ptr, ref); + if (n_ptr) + return n_ptr; + + /* Links to any other clusters within this zone ? */ + for (c_num = 1; c_num <= tipc_max_clusters; c_num++) { + c_ptr = z_ptr->clusters[c_num]; + if (!c_ptr) + return 0; + n_ptr = cluster_select_node(c_ptr, ref); + if (n_ptr) + return n_ptr; + } + return 0; +} + +u32 zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref) +{ + struct cluster *c_ptr; + u32 c_num; + u32 router; + + if (!z_ptr) + return 0; + c_ptr = z_ptr->clusters[tipc_cluster(addr)]; + router = c_ptr ? cluster_select_router(c_ptr, ref) : 0; + if (router) + return router; + + /* Links to any other clusters within the zone? */ + for (c_num = 1; c_num <= tipc_max_clusters; c_num++) { + c_ptr = z_ptr->clusters[c_num]; + router = c_ptr ? cluster_select_router(c_ptr, ref) : 0; + if (router) + return router; + } + return 0; +} + + +u32 zone_next_node(u32 addr) +{ + struct cluster *c_ptr = cluster_find(addr); + + if (c_ptr) + return cluster_next_node(c_ptr, addr); + return 0; +} + diff --git a/net/tipc/zone.h b/net/tipc/zone.h new file mode 100644 index 000000000000..4326f78d8292 --- /dev/null +++ b/net/tipc/zone.h @@ -0,0 +1,71 @@ +/* + * net/tipc/zone.h: Include file for TIPC zone management routines + * + * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2005, Wind River Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TIPC_ZONE_H +#define _TIPC_ZONE_H + +#include "node_subscr.h" +#include "net.h" + + +/** + * struct _zone - TIPC zone structure + * @addr: network address of zone + * @clusters: array of pointers to all clusters within zone + * @links: (used for inter-zone communication) + */ + +struct _zone { + u32 addr; + struct cluster *clusters[2]; /* currently limited to just 1 cluster */ + u32 links; +}; + +struct node *zone_select_remote_node(struct _zone *z_ptr, u32 addr, u32 ref); +u32 zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref); +void zone_remove_as_router(struct _zone *z_ptr, u32 router); +void zone_send_external_routes(struct _zone *z_ptr, u32 dest); +struct _zone *zone_create(u32 addr); +void zone_delete(struct _zone *z_ptr); +void zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr); +u32 zone_next_node(u32 addr); + +static inline struct _zone *zone_find(u32 addr) +{ + return net.zones[tipc_zone(addr)]; +} + +#endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index acc73ba8bade..1b5989b1b670 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -121,7 +121,7 @@ int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -DEFINE_RWLOCK(unix_table_lock); +DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); /* * SMP locking strategy: - * hash table is protected with rwlock unix_table_lock + * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate rwlock. 
*/ @@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, @@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, { struct sock *s; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); s = __unix_find_socket_byname(sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; @@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = 
PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock) u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); + spin_lock_init(&u->lock); atomic_set(&u->inflight, sock ? 0 : -1); init_MUTEX(&u->readsem); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); @@ -642,12 +642,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... 
*/ if (!(ordernum&0xFF)) yield(); @@ -658,7 +658,7 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; out: up(&u->readsem); @@ -784,14 +784,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0); if (err) goto out_mknod_dput; - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); dput(nd.dentry); nd.dentry = dentry; addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; @@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: up(&u->readsem); out: @@ -823,7 +823,7 @@ out: out_mknod_dput: dput(dentry); out_mknod_unlock: - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out_mknod_parent: if (err==-EEXIST) @@ -1063,10 +1063,12 @@ restart: /* Set credentials */ sk->sk_peercred = other->sk_peercred; - sock_hold(newsk); - unix_peer(sk) = newsk; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; unix_state_wunlock(sk); @@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } else { sunaddr = NULL; err = -ENOTCONN; - other = unix_peer_get(sk); + other = unix_peer(sk); if (!other) goto out_err; } @@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, other->sk_data_ready(other, size); sent+=size; } - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; @@ -1491,8 +1492,6 @@ pipe_err: send_sig(SIGPIPE,current,0); err = -EPIPE; 
out_err: - if (other) - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; @@ -1860,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } return err; @@ -1917,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); } @@ -1932,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void unix_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ffc64e1712d..411802bd4d37 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -182,7 +182,7 @@ void unix_gc(void) if (down_trylock(&unix_gc_sem)) return; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); forall_unix_sockets(i, s) { @@ -301,7 +301,7 @@ void unix_gc(void) } u->gc_tree = GC_ORPHAN; } - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* * Here we are. Hitlist is filled. Die. 
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 59fec59b2132..8b9bf4a763b5 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -36,6 +36,7 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> @@ -181,7 +182,7 @@ struct wanpipe_opt #endif static int sk_count; -extern struct proto_ops wanpipe_ops; +extern const struct proto_ops wanpipe_ops; static unsigned long find_free_critical; static void wanpipe_unlink_driver(struct sock *sk); @@ -1839,7 +1840,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar #endif default: - return dev_ioctl(cmd,(void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ } @@ -2546,7 +2547,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr return 0; } -struct proto_ops wanpipe_ops = { +const struct proto_ops wanpipe_ops = { .family = PF_WANPIPE, .owner = THIS_MODULE, .release = wanpipe_release, diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index bcf7b3faa76a..c34833dc7cc1 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -44,6 +44,7 @@ #include <linux/config.h> #include <linux/stddef.h> /* offsetof(), etc. 
*/ +#include <linux/capability.h> #include <linux/errno.h> /* return codes */ #include <linux/kernel.h> #include <linux/init.h> diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 020d73cc8414..72b6ff3299ba 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -37,6 +37,7 @@ #include <linux/config.h> #include <linux/module.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -64,7 +65,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); -static struct proto_ops x25_proto_ops; +static const struct proto_ops x25_proto_ops; static struct x25_address null_x25_address = {" "}; @@ -540,12 +541,7 @@ static struct sock *x25_make_new(struct sock *osk) sk->sk_state = TCP_ESTABLISHED; sk->sk_sleep = osk->sk_sleep; sk->sk_backlog_rcv = osk->sk_backlog_rcv; - - if (sock_flag(osk, SOCK_ZAPPED)) - sock_set_flag(sk, SOCK_ZAPPED); - - if (sock_flag(osk, SOCK_DBG)) - sock_set_flag(sk, SOCK_DBG); + sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; @@ -1378,7 +1374,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } @@ -1391,7 +1387,7 @@ static struct net_proto_family x25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 2f4531fcaca2..6ed3302312fb 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -540,8 +540,7 @@ void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm, start = end; } } - if (len) - BUG(); + BUG_ON(len); } EXPORT_SYMBOL_GPL(skb_icv_walk); @@ -610,8 +609,7 @@ skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) start = end; } } - if (len) - 
BUG(); + BUG_ON(len); return elt; } EXPORT_SYMBOL_GPL(skb_to_sgvec); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d19e274b9c4a..077bbf9fb9b7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -10,7 +10,7 @@ * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor - * + * */ #include <asm/bug.h> @@ -22,6 +22,7 @@ #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> +#include <linux/netfilter.h> #include <linux/module.h> #include <net/xfrm.h> #include <net/ip.h> @@ -247,15 +248,14 @@ EXPORT_SYMBOL(xfrm_policy_alloc); void __xfrm_policy_destroy(struct xfrm_policy *policy) { - if (!policy->dead) - BUG(); + BUG_ON(!policy->dead); - if (policy->bundles) - BUG(); + BUG_ON(policy->bundles); if (del_timer(&policy->timer)) BUG(); + security_xfrm_policy_free(policy); kfree(policy); } EXPORT_SYMBOL(__xfrm_policy_destroy); @@ -350,7 +350,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { - if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 && + xfrm_sec_ctx_match(pol->security, policy->security)) { if (excl) { write_unlock_bh(&xfrm_policy_lock); return -EEXIST; @@ -416,14 +417,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete) +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete) { struct xfrm_policy *pol, **p; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { - if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + if ((memcmp(sel, &pol->selector, 
sizeof(*sel)) == 0) && + (xfrm_sec_ctx_match(ctx, pol->security))) { xfrm_pol_hold(pol); if (delete) *p = pol->next; @@ -438,7 +440,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, } return pol; } -EXPORT_SYMBOL(xfrm_policy_bysel); +EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) { @@ -519,7 +521,7 @@ EXPORT_SYMBOL(xfrm_policy_walk); /* Find policy to apply to this flow. */ -static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, +static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; @@ -533,9 +535,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, continue; match = xfrm_selector_match(sel, fl, family); + if (match) { - xfrm_pol_hold(pol); - break; + if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) { + xfrm_pol_hold(pol); + break; + } } } read_unlock_bh(&xfrm_policy_lock); @@ -543,15 +548,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, *obj_refp = &pol->refcnt; } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid) { struct xfrm_policy *pol; read_lock_bh(&xfrm_policy_lock); if ((pol = sk->sk_policy[dir]) != NULL) { - int match = xfrm_selector_match(&pol->selector, fl, + int match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); + int err = 0; + if (match) + err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir)); + 
+ if (match && !err) xfrm_pol_hold(pol); else pol = NULL; @@ -624,6 +651,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) if (newp) { newp->selector = old->selector; + if (security_xfrm_policy_clone(old, newp)) { + kfree(newp); + return NULL; /* ENOMEM */ + } newp->lft = old->lft; newp->curlft = old->curlft; newp->action = old->action; @@ -735,22 +766,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } -static inline int policy_to_flow_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - switch (dir) { - default: - case XFRM_POLICY_IN: - return FLOW_DIR_IN; - case XFRM_POLICY_OUT: - return FLOW_DIR_OUT; - case XFRM_POLICY_FWD: - return FLOW_DIR_FWD; - }; -} static int stale_bundle(struct dst_entry *dst); @@ -769,19 +784,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, int err; u32 genid; u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + u32 sk_sid = security_sk_sid(sk, fl, dir); restart: genid = atomic_read(&flow_cache_genid); policy = NULL; if (sk && sk->sk_policy[1]) - policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid); if (!policy) { /* To accelerate a bit... 
*/ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) return 0; - policy = flow_cache_lookup(fl, family, - policy_to_flow_dir(XFRM_POLICY_OUT), + policy = flow_cache_lookup(fl, sk_sid, family, dir, xfrm_policy_lookup); } @@ -934,8 +950,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, return start; } -static int -_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) +int +xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) { struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -946,6 +962,7 @@ _decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) xfrm_policy_put_afinfo(afinfo); return 0; } +EXPORT_SYMBOL(xfrm_decode_session); static inline int secpath_has_tunnel(struct sec_path *sp, int k) { @@ -962,16 +979,21 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, { struct xfrm_policy *pol; struct flowi fl; + u8 fl_dir = policy_to_flow_dir(dir); + u32 sk_sid; - if (_decode_session(skb, &fl, family) < 0) + if (xfrm_decode_session(skb, &fl, family) < 0) return 0; + nf_nat_decode_session(skb, &fl, family); + + sk_sid = security_sk_sid(sk, &fl, fl_dir); /* First, check used SA against their selectors. 
*/ if (skb->sp) { int i; for (i=skb->sp->len-1; i>=0; i--) { - struct sec_decap_state *xvec = &(skb->sp->x[i]); + struct sec_decap_state *xvec = &(skb->sp->x[i]); if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) return 0; @@ -986,11 +1008,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; if (sk && sk->sk_policy[dir]) - pol = xfrm_sk_policy_lookup(sk, dir, &fl); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid); if (!pol) - pol = flow_cache_lookup(&fl, family, - policy_to_flow_dir(dir), + pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir, xfrm_policy_lookup); if (!pol) @@ -1035,7 +1056,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct flowi fl; - if (_decode_session(skb, &fl, family) < 0) + if (xfrm_decode_session(skb, &fl, family) < 0) return 0; return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 479effc97666..e12d0be5f976 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -10,7 +10,7 @@ * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation - * + * */ #include <linux/workqueue.h> @@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + security_xfrm_state_free(x); kfree(x); } @@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, selector. 
*/ if (x->km.state == XFRM_STATE_VALID) { - if (!xfrm_selector_match(&x->sel, fl, family)) + if (!xfrm_selector_match(&x->sel, fl, family) || + !xfrm_sec_ctx_match(pol->security, x->security)) continue; if (!best || best->km.dying > x->km.dying || @@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, acquire_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, family)) + if (xfrm_selector_match(&x->sel, fl, family) && + xfrm_sec_ctx_match(pol->security, x->security)) error = -ESRCH; } } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cdd9a07e043..ac87a09ba83e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -7,7 +7,7 @@ * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support - * + * */ #include <linux/module.h> @@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma) return 0; } + +static inline int verify_sec_ctx_len(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1]; + struct xfrm_user_sec_ctx *uctx; + int len = 0; + + if (!rt) + return 0; + + if (rt->rta_len < sizeof(*uctx)) + return -EINVAL; + + uctx = RTA_DATA(rt); + + if (uctx->ctx_len > PAGE_SIZE) + return -EINVAL; + + len += sizeof(struct xfrm_user_sec_ctx); + len += uctx->ctx_len; + + if (uctx->len != len) + return -EINVAL; + + return 0; +} + + static int verify_newsa_info(struct xfrm_usersa_info *p, struct rtattr **xfrma) { @@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_encap_tmpl(xfrma))) goto out; + if ((err = verify_sec_ctx_len(xfrma))) + goto out; err = -EINVAL; switch (p->mode) { @@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a return 0; } + +static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + int len = 0; + + if 
(xfrm_ctx) { + len += sizeof(struct xfrm_user_sec_ctx); + len += xfrm_ctx->ctx_len; + } + return len; +} + +static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg) +{ + struct xfrm_user_sec_ctx *uctx; + + if (!u_arg) + return 0; + + uctx = RTA_DATA(u_arg); + return security_xfrm_state_alloc(x, uctx); +} + static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memcpy(&x->id, &p->id, sizeof(x->id)); @@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, if (err) goto error; + if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1]))) + goto error; + x->km.seq = p->seq; return x; @@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) int err; struct km_event c; - err = verify_newsa_info(p, (struct rtattr **) xfrma); + err = verify_newsa_info(p, (struct rtattr **)xfrma); if (err) return err; - x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err); if (!x) return err; @@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) if (x->encap) RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + if (x->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + x->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = x->security->ctx_doi; + uctx->ctx_alg = x->security->ctx_alg; + uctx->ctx_len = x->security->ctx_len; + memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len); + } nlh->nlmsg_len = skb->tail - b; out: sp->this_idx++; @@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) return verify_policy_dir(p->dir); } +static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1]; + struct 
xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = RTA_DATA(rt); + return security_xfrm_policy_alloc(pol, uctx); +} + static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int nr) { @@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, } copy_from_user_policy(xp, p); - err = copy_from_user_tmpl(xp, xfrma); + + if (!(err = copy_from_user_tmpl(xp, xfrma))) + err = copy_from_user_sec_ctx(xp, xfrma); + if (err) { *errp = err; kfree(xp); @@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr err = verify_newpolicy_info(p); if (err) return err; + err = verify_sec_ctx_len((struct rtattr **)xfrma); + if (err) + return err; - xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); + xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err); if (!xp) return err; @@ -714,6 +802,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; err = xfrm_policy_insert(p->dir, xp, excl); if (err) { + security_xfrm_policy_free(xp); kfree(xp); return err; } @@ -761,6 +850,27 @@ rtattr_failure: return -1; } +static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) +{ + if (xp->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + xp->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = xp->security->ctx_doi; + uctx->ctx_alg = xp->security->ctx_alg; + uctx->ctx_len = xp->security->ctx_len; + memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len); + } + return 0; + + rtattr_failure: + return -1; +} + static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; @@ -782,6 +892,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, 
int count, void *ptr copy_to_user_policy(xp, p, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; out: @@ -852,8 +964,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr if (p->index) xp = xfrm_policy_byid(p->dir, p->index, delete); - else - xp = xfrm_policy_bysel(p->dir, &p->sel, delete); + else { + struct rtattr **rtattrs = (struct rtattr **)xfrma; + struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1]; + struct xfrm_policy tmp; + + err = verify_sec_ctx_len(rtattrs); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct xfrm_policy)); + if (rt) { + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + if ((err = security_xfrm_policy_alloc(&tmp, uctx))) + return err; + } + xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete); + security_xfrm_policy_free(&tmp); + } if (xp == NULL) return -ENOENT; @@ -1224,6 +1353,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1241,6 +1372,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -1324,6 +1456,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, copy_to_user_policy(xp, &upe->pol, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; upe->hard = !!hard; nlh->nlmsg_len = skb->tail - b; @@ -1341,6 +1475,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); 
len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; |