diff options
Diffstat (limited to 'net')
237 files changed, 6902 insertions, 7077 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 211b5686d719..a1b7117a9600 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -3,8 +3,7 @@ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ -/* - * Based on patches from Jon Smirl <jonsmirl@gmail.com> +/* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -58,16 +57,15 @@ #include <net/ipv6.h> #include <net/af_ieee802154.h> -/* - * Uncompress address function for source and +/* Uncompress address function for source and * destination address(non-multicast). * * address_mode is sam value or dam value. */ static int uncompress_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, const u8 address_mode, - const u8 *lladdr, const u8 addr_type, - const u8 addr_len) + struct in6_addr *ipaddr, const u8 address_mode, + const u8 *lladdr, const u8 addr_type, + const u8 addr_len) { bool fail; @@ -140,13 +138,12 @@ static int uncompress_addr(struct sk_buff *skb, return 0; } -/* - * Uncompress address function for source context +/* Uncompress address function for source context * based address(non-multicast). */ static int uncompress_context_based_src_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 sam) + struct in6_addr *ipaddr, + const u8 sam) { switch (sam) { case LOWPAN_IPHC_ADDR_00: @@ -175,13 +172,13 @@ static int uncompress_context_based_src_addr(struct sk_buff *skb, } static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, - struct net_device *dev, skb_delivery_cb deliver_skb) + struct net_device *dev, skb_delivery_cb deliver_skb) { struct sk_buff *new; int stat; - new = skb_copy_expand(skb, sizeof(struct ipv6hdr), skb_tailroom(skb), - GFP_ATOMIC); + new = skb_copy_expand(skb, sizeof(struct ipv6hdr), + skb_tailroom(skb), GFP_ATOMIC); kfree_skb(skb); if (!new) @@ -196,7 +193,7 @@ static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, new->dev = dev; raw_dump_table(__func__, "raw skb data dump before receiving", - new->data, new->len); + new->data, new->len); stat = deliver_skb(new, dev); @@ -210,8 +207,8 @@ static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, */ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 dam) + struct in6_addr *ipaddr, + const u8 dam) { bool fail; @@ -300,7 +297,6 @@ uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) default: pr_debug("ERROR: unknown UDP format\n"); goto err; - break; } pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", @@ -314,8 +310,7 @@ uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) fail |= lowpan_fetch_skb(skb, &uh->check, 2); } - /* - * UDP lenght needs to be infered from the lower layers + /* UDP lenght needs to be infered from the lower layers * here, we obtain the hint from the remaining size of the * frame */ @@ -338,16 +333,17 @@ err: static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 }; int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, - const u8 *saddr, const u8 saddr_type, const u8 saddr_len, - const u8 *daddr, const u8 daddr_type, const u8 daddr_len, - u8 iphc0, u8 iphc1, skb_delivery_cb deliver_skb) + const u8 *saddr, const u8 saddr_type, + const u8 saddr_len, const u8 *daddr, + const u8 daddr_type, const u8 daddr_len, + u8 iphc0, u8 iphc1, skb_delivery_cb deliver_skb) { struct ipv6hdr hdr = {}; u8 tmp, num_context = 0; int err; raw_dump_table(__func__, "raw skb data dump uncompressed", - skb->data, skb->len); + skb->data, skb->len); /* another if the CID flag is set */ if (iphc1 & LOWPAN_IPHC_CID) { @@ -360,8 +356,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, /* Traffic Class and Flow Label */ switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) { - /* - * Traffic Class and FLow Label carried in-line + /* Traffic Class and FLow Label carried in-line * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) */ case 0: /* 00b */ @@ -374,8 +369,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) | (hdr.flow_lbl[0] & 0x0f); break; - /* - * Traffic class carried in-line + /* Traffic class carried in-line * ECN + DSCP (1 byte), Flow Label is elided */ case 2: /* 10b */ @@ -385,8 +379,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, hdr.priority = ((tmp >> 2) & 0x0f); hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); break; - /* - * Flow Label carried in-line + /* Flow Label carried in-line * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided */ case 1: /* 01b */ @@ -415,9 +408,9 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, } /* Hop Limit */ - if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) + if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) { hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03]; - else { + } else { if (lowpan_fetch_skb_u8(skb, &(hdr.hop_limit))) goto drop; } @@ -429,12 +422,12 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, /* Source address context based uncompression */ pr_debug("SAC bit is set. Handle context based source address.\n"); err = uncompress_context_based_src_addr( - skb, &hdr.saddr, tmp); + skb, &hdr.saddr, tmp); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); err = uncompress_addr(skb, &hdr.saddr, tmp, saddr, - saddr_type, saddr_len); + saddr_type, saddr_len); } /* Check on error of previous branch */ @@ -457,9 +450,9 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, } } else { err = uncompress_addr(skb, &hdr.daddr, tmp, daddr, - daddr_type, daddr_len); + daddr_type, daddr_len); pr_debug("dest: stateless compression mode %d dest %pI6c\n", - tmp, &hdr.daddr); + tmp, &hdr.daddr); if (err) goto drop; } @@ -468,11 +461,11 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, if (iphc0 & LOWPAN_IPHC_NH_C) { struct udphdr uh; struct sk_buff *new; + if (uncompress_udp_header(skb, &uh)) goto drop; - /* - * replace the compressed UDP head by the uncompressed UDP + /* replace the compressed UDP head by the uncompressed UDP * header */ new = skb_copy_expand(skb, sizeof(struct udphdr), @@ -489,7 +482,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); raw_dump_table(__func__, "raw UDP header dump", - (u8 *)&uh, sizeof(uh)); + (u8 *)&uh, sizeof(uh)); hdr.nexthdr = UIP_PROTO_UDP; } @@ -504,8 +497,8 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, hdr.version, ntohs(hdr.payload_len), hdr.nexthdr, hdr.hop_limit, &hdr.daddr); - raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, - sizeof(hdr)); + raw_dump_table(__func__, "raw header dump", + (u8 *)&hdr, sizeof(hdr)); return skb_deliver(skb, &hdr, dev, deliver_skb); @@ -516,8 +509,8 @@ drop: EXPORT_SYMBOL_GPL(lowpan_process_data); static u8 lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, - const struct in6_addr *ipaddr, - const unsigned char *lladdr) + const struct in6_addr *ipaddr, + const unsigned char *lladdr) { u8 val = 0; @@ -530,14 +523,14 @@ static u8 lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, *hc06_ptr += 2; val = 2; /* 16-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", - *hc06_ptr - 2, 2); + *hc06_ptr - 2, 2); } else { /* do not compress IID => xxxx::IID */ memcpy(*hc06_ptr, &ipaddr->s6_addr16[4], 8); *hc06_ptr += 8; val = 1; /* 64-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", - *hc06_ptr - 8, 8); + *hc06_ptr - 8, 8); } return rol8(val, shift); @@ -601,8 +594,8 @@ static void compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) } int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) { u8 tmp, iphc0, iphc1, *hc06_ptr; struct ipv6hdr *hdr; @@ -616,14 +609,13 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n" "\tnexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n", - hdr->version, ntohs(hdr->payload_len), hdr->nexthdr, - hdr->hop_limit, &hdr->daddr); + hdr->version, ntohs(hdr->payload_len), hdr->nexthdr, + hdr->hop_limit, &hdr->daddr); raw_dump_table(__func__, "raw skb network header dump", - skb_network_header(skb), sizeof(struct ipv6hdr)); + skb_network_header(skb), sizeof(struct ipv6hdr)); - /* - * As we copy some bit-length fields, in the IPHC encoding bytes, + /* As we copy some bit-length fields, in the IPHC encoding bytes, * we sometimes use |= * If the field is 0, and the current bit value in memory is 1, * this does not work. We therefore reset the IPHC encoding here @@ -639,11 +631,10 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, (unsigned char *)_daddr, IEEE802154_ADDR_LEN); raw_dump_table(__func__, - "sending raw skb network uncompressed packet", - skb->data, skb->len); + "sending raw skb network uncompressed packet", + skb->data, skb->len); - /* - * Traffic class, flow label + /* Traffic class, flow label * If flow label is 0, compress it. If traffic class is 0, compress it * We have to process both in the same time as the offset of traffic * class depends on the presence of version and flow label @@ -654,11 +645,11 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, tmp = ((tmp & 0x03) << 6) | (tmp >> 2); if (((hdr->flow_lbl[0] & 0x0F) == 0) && - (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { + (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { /* flow label can be compressed */ iphc0 |= LOWPAN_IPHC_FL_C; if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { + ((hdr->flow_lbl[0] & 0xF0) == 0)) { /* compress (elide) all */ iphc0 |= LOWPAN_IPHC_TC_C; } else { @@ -669,7 +660,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, } else { /* Flow label cannot be compressed */ if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { + ((hdr->flow_lbl[0] & 0xF0) == 0)) { /* compress only traffic class */ iphc0 |= LOWPAN_IPHC_TC_C; *hc06_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); @@ -695,8 +686,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, hc06_ptr += 1; } - /* - * Hop limit + /* Hop limit * if 1: compress, encoding is 01 * if 64: compress, encoding is 10 * if 255: compress, encoding is 11 @@ -793,7 +783,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, pr_debug("header len %d skb %u\n", (int)(hc06_ptr - head), skb->len); raw_dump_table(__func__, "raw skb data dump compressed", - skb->data, skb->len); + skb->data, skb->len); return 0; } EXPORT_SYMBOL_GPL(lowpan_header_compress); diff --git a/net/802/fc.c b/net/802/fc.c index 05eea6b98bb8..7c174b6750cd 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -126,6 +126,6 @@ static void fc_setup(struct net_device *dev) */ struct net_device *alloc_fcdev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "fc%d", fc_setup); + return alloc_netdev(sizeof_priv, "fc%d", NET_NAME_UNKNOWN, fc_setup); } EXPORT_SYMBOL(alloc_fcdev); diff --git a/net/802/fddi.c b/net/802/fddi.c index 9cda40661e0d..59e7346f1193 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -207,7 +207,8 @@ static void fddi_setup(struct net_device *dev) */ struct net_device *alloc_fddidev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup); + return alloc_netdev(sizeof_priv, "fddi%d", NET_NAME_UNKNOWN, + fddi_setup); } EXPORT_SYMBOL(alloc_fddidev); diff --git a/net/802/hippi.c b/net/802/hippi.c index 5ff2a718ddca..2e03f8259dd5 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -228,7 +228,8 @@ static void hippi_setup(struct net_device *dev) struct net_device *alloc_hippi_dev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "hip%d", hippi_setup); + return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN, + hippi_setup); } EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 44ebd5c2cd4a..cba9c212a730 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -250,7 +250,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } - new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, vlan_setup); + new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, + NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 9012b1c922b6..75d427763992 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -114,8 +114,11 @@ EXPORT_SYMBOL(vlan_dev_vlan_proto); static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) { - if (skb_cow(skb, skb_headroom(skb)) < 0) + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); return NULL; + } + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ad2ac3c00398..35a6b6b15e8a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -385,6 +385,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: + case SIOCSHWTSTAMP: + case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_do_ioctl) err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); break; @@ -627,8 +629,6 @@ static void vlan_dev_uninit(struct net_device *dev) struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; - free_percpu(vlan->vlan_pcpu_stats); - vlan->vlan_pcpu_stats = NULL; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; @@ -785,6 +785,15 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, }; +static void vlan_dev_free(struct net_device *dev) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + + free_percpu(vlan->vlan_pcpu_stats); + vlan->vlan_pcpu_stats = NULL; + free_netdev(dev); +} + void vlan_setup(struct net_device *dev) { ether_setup(dev); @@ -794,7 +803,7 @@ void vlan_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; memset(dev->broadcast, 0, ETH_ALEN); diff --git a/net/9p/client.c b/net/9p/client.c index 0004cbaac4a4..e86a9bea1d16 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -959,7 +959,6 @@ static int p9_client_version(struct p9_client *c) break; default: return -EINVAL; - break; } if (IS_ERR(req)) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 01a1082e02b3..c00897f65a31 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1489,8 +1489,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; /* Queue packet (standard) */ - skb->sk = sock; - if (sock_queue_rcv_skb(sock, skb) < 0) goto drop; @@ -1644,7 +1642,6 @@ static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr if (!skb) goto out; - skb->sk = sk; skb_reserve(skb, ddp_dl->header_length); skb_reserve(skb, dev->hard_header_len); skb->dev = dev; @@ -1808,7 +1805,7 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) long amount = 0; if (skb) - amount = skb->len - sizeof(struct ddpehdr); + amount = skb->len - sizeof(struct ddpehdr); rc = put_user(amount, (int __user *)argp); break; } diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c index 6c8016f61866..e4158b8b926d 100644 --- a/net/appletalk/dev.c +++ b/net/appletalk/dev.c @@ -39,6 +39,7 @@ static void ltalk_setup(struct net_device *dev) struct net_device *alloc_ltalkdev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup); + return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN, + ltalk_setup); } EXPORT_SYMBOL(alloc_ltalkdev); diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 403e71fa88fe..cc78538d163b 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -682,8 +682,8 @@ static int br2684_create(void __user *arg) netdev = alloc_netdev(sizeof(struct br2684_dev), ni.ifname[0] ? ni.ifname : "nas%d", - (payload == p_routed) ? - br2684_setup_routed : br2684_setup); + NET_NAME_UNKNOWN, + (payload == p_routed) ? br2684_setup_routed : br2684_setup); if (!netdev) return -ENOMEM; diff --git a/net/atm/clip.c b/net/atm/clip.c index ba291ce4bdff..46339040fef0 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -520,7 +520,8 @@ static int clip_create(int number) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } - dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup); + dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, + clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 6f0d9ec37950..a957c8140721 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -800,11 +800,6 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; - /* check if it is a claim packet in general */ - if (memcmp(bla_dst->magic, bla_dst_own->magic, - sizeof(bla_dst->magic)) != 0) - return 0; - /* if announcement packet, use the source, * otherwise assume it is in the hw_src */ @@ -866,12 +861,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct sk_buff *skb) { - struct batadv_bla_claim_dst *bla_dst; + struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; uint8_t *hw_src, *hw_dst; - struct vlan_ethhdr *vhdr; + struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; unsigned short vid; + int vlan_depth = 0; __be16 proto; int headlen; int ret; @@ -882,9 +878,24 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, proto = ethhdr->h_proto; headlen = ETH_HLEN; if (vid & BATADV_VLAN_HAS_TAG) { - vhdr = vlan_eth_hdr(skb); - proto = vhdr->h_vlan_encapsulated_proto; - headlen += VLAN_HLEN; + /* Traverse the VLAN/Ethertypes. + * + * At this point it is known that the first protocol is a VLAN + * header, so start checking at the encapsulated protocol. + * + * The depth of the VLAN headers is recorded to drop BLA claim + * frames encapsulated into multiple VLAN headers (QinQ). + */ + do { + vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, + &vhdr_buf); + if (!vhdr) + return 0; + + proto = vhdr->h_vlan_encapsulated_proto; + headlen += VLAN_HLEN; + vlan_depth++; + } while (proto == htons(ETH_P_8021Q)); } if (proto != htons(ETH_P_ARP)) @@ -914,6 +925,19 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, hw_src = (uint8_t *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; + bla_dst_own = &bat_priv->bla.claim_dest; + + /* check if it is a claim frame in general */ + if (memcmp(bla_dst->magic, bla_dst_own->magic, + sizeof(bla_dst->magic)) != 0) + return 0; + + /* check if there is a claim frame encapsulated deeper in (QinQ) and + * drop that, as this is not supported by BLA but should also not be + * sent via the mesh. + */ + if (vlan_depth > 1) + return 1; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index e7ee65dc20bf..e0a723991c54 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -448,10 +448,15 @@ out: * possibly free it * @softif_vlan: the vlan object to release */ -void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan) +void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan) { - if (atomic_dec_and_test(&softif_vlan->refcount)) - kfree_rcu(softif_vlan, rcu); + if (atomic_dec_and_test(&vlan->refcount)) { + spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + hlist_del_rcu(&vlan->list); + spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + + kfree_rcu(vlan, rcu); + } } /** @@ -505,6 +510,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) if (!vlan) return -ENOMEM; + vlan->bat_priv = bat_priv; vlan->vid = vid; atomic_set(&vlan->refcount, 1); @@ -516,6 +522,10 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) return err; } + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ @@ -523,10 +533,6 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - return 0; } @@ -538,18 +544,13 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_del_rcu(&vlan->list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - - batadv_sysfs_del_vlan(bat_priv, vlan); - /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); + batadv_sysfs_del_vlan(bat_priv, vlan); batadv_softif_vlan_free_ref(vlan); } @@ -567,6 +568,8 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct batadv_softif_vlan *vlan; + int ret; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types @@ -576,7 +579,36 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, vid |= BATADV_VLAN_HAS_TAG; - return batadv_softif_create_vlan(bat_priv, vid); + /* if a new vlan is getting created and it already exists, it means that + * it was not deleted yet. batadv_softif_vlan_get() increases the + * refcount in order to revive the object. + * + * if it does not exist then create it. + */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + if (!vlan) + return batadv_softif_create_vlan(bat_priv, vid); + + /* recreate the sysfs object if it was already destroyed (and it should + * be since we received a kill_vid() for this vlan + */ + if (!vlan->kobj) { + ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); + if (ret) { + batadv_softif_vlan_free_ref(vlan); + return ret; + } + } + + /* add a new TT local entry. This one will be marked with the NOPURGE + * flag. This must be added again, even if the vlan object already + * exists, because the entry was deleted by kill_vid() + */ + batadv_tt_local_add(bat_priv->soft_iface, + bat_priv->soft_iface->dev_addr, vid, + BATADV_NULL_IFINDEX, BATADV_NO_MARK); + + return 0; } /** @@ -895,7 +927,7 @@ struct net_device *batadv_softif_create(const char *name) int ret; soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, - batadv_softif_init_early); + NET_NAME_UNKNOWN, batadv_softif_init_early); if (!soft_iface) return NULL; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fc47baa888c5..f40cb0436eba 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -900,32 +900,24 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, bat_kobj = &bat_priv->soft_iface->dev.kobj; - uevent_env[0] = kmalloc(strlen(BATADV_UEV_TYPE_VAR) + - strlen(batadv_uev_type_str[type]) + 1, - GFP_ATOMIC); + uevent_env[0] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_TYPE_VAR, + batadv_uev_type_str[type]); if (!uevent_env[0]) goto out; - sprintf(uevent_env[0], "%s%s", BATADV_UEV_TYPE_VAR, - batadv_uev_type_str[type]); - - uevent_env[1] = kmalloc(strlen(BATADV_UEV_ACTION_VAR) + - strlen(batadv_uev_action_str[action]) + 1, - GFP_ATOMIC); + uevent_env[1] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_ACTION_VAR, + batadv_uev_action_str[action]); if (!uevent_env[1]) goto out; - sprintf(uevent_env[1], "%s%s", BATADV_UEV_ACTION_VAR, - batadv_uev_action_str[action]); - /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { - uevent_env[2] = kmalloc(strlen(BATADV_UEV_DATA_VAR) + - strlen(data) + 1, GFP_ATOMIC); + uevent_env[2] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) goto out; - - sprintf(uevent_env[2], "%s%s", BATADV_UEV_DATA_VAR, data); } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index d636bde72c9a..5f59e7f899a0 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -511,6 +511,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; + struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; @@ -572,6 +573,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, if (!tt_local) goto out; + /* increase the refcounter of the related vlan */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, BATADV_PRINT_VID(vid), @@ -604,6 +608,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_free_ref(tt_local); + batadv_softif_vlan_free_ref(vlan); goto out; } @@ -1009,6 +1014,7 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, { struct batadv_tt_local_entry *tt_local_entry; uint16_t flags, curr_flags = BATADV_NO_FLAGS; + struct batadv_softif_vlan *vlan; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -1039,6 +1045,11 @@ uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, hlist_del_rcu(&tt_local_entry->common.hash_entry); batadv_tt_local_entry_free_ref(tt_local_entry); + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, vid); + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + out: if (tt_local_entry) batadv_tt_local_entry_free_ref(tt_local_entry); @@ -1111,6 +1122,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; + struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; uint32_t i; @@ -1131,6 +1143,13 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) tt_local = container_of(tt_common_entry, struct batadv_tt_local_entry, common); + + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, + tt_common_entry->vid); + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); @@ -3139,6 +3158,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; + struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3167,6 +3187,12 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) tt_local = container_of(tt_common, struct batadv_tt_local_entry, common); + + /* decrease the reference held for this vlan */ + vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); + batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); + batadv_tt_local_entry_free_ref(tt_local); } spin_unlock_bh(list_lock); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 34891a56773f..8854c05622a9 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -687,6 +687,7 @@ struct batadv_priv_nc { /** * struct batadv_softif_vlan - per VLAN attributes set + * @bat_priv: pointer to the mesh object * @vid: VLAN identifier * @kobj: kobject for sysfs vlan subdirectory * @ap_isolation: AP isolation state @@ -696,6 +697,7 @@ struct batadv_priv_nc { * @rcu: struct used for freeing in a RCU-safe manner */ struct batadv_softif_vlan { + struct batadv_priv *bat_priv; unsigned short vid; struct kobject *kobj; atomic_t ap_isolation; /* boolean */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 5a7f81df603c..206b65ccd5b8 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -712,7 +712,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) unsigned long flags; netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE, - netdev_setup); + NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index a841d3e776c5..85bcc21e84d2 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -538,8 +538,9 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) /* session struct allocated as private part of net_device */ dev = alloc_netdev(sizeof(struct bnep_session), - (*req->device) ? req->device : "bnep%d", - bnep_net_setup); + (*req->device) ? req->device : "bnep%d", + NET_NAME_UNKNOWN, + bnep_net_setup); if (!dev) return -ENOMEM; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b524c36c1273..0bb9d8b63dd2 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -93,7 +93,7 @@ static void fdb_rcu_free(struct rcu_head *head) static void fdb_add_hw(struct net_bridge *br, const unsigned char *addr) { int err; - struct net_bridge_port *p, *tmp; + struct net_bridge_port *p; ASSERT_RTNL(); @@ -107,11 +107,9 @@ static void fdb_add_hw(struct net_bridge *br, const unsigned char *addr) return; undo: - list_for_each_entry(tmp, &br->port_list, list) { - if (tmp == p) - break; - if (!br_promisc_port(tmp)) - dev_uc_del(tmp->dev, addr); + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + if (!br_promisc_port(p)) + dev_uc_del(p->dev, addr); } } @@ -678,6 +676,7 @@ errout: int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { struct net_bridge *br = netdev_priv(dev); @@ -693,6 +692,19 @@ int br_fdb_dump(struct sk_buff *skb, if (idx < cb->args[0]) goto skip; + if (filter_dev && + (!f->dst || f->dst->dev != filter_dev)) { + if (filter_dev != dev) + goto skip; + /* !f->dst is a speacial case for bridge + * It means the MAC belongs to the bridge + * Therefore need a little more filtering + * we only want to dump the !f->dst case + */ + if (f->dst) + goto skip; + } + if (fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 3eca3fdf8fe1..078d336a1f37 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -344,7 +344,7 @@ int br_add_bridge(struct net *net, const char *name) struct net_device *dev; int res; - dev = alloc_netdev(sizeof(struct net_bridge), name, + dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index abfa0b65a111..b4845f4b2bb4 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2216,6 +2216,43 @@ unlock: EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); /** + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge + * @dev: The bridge port providing the bridge on which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a valid querier exists anywhere on the bridged link layer. + * Otherwise returns false. + */ +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct ethhdr eth; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + memset(ð, 0, sizeof(eth)); + eth.h_proto = htons(proto); + + ret = br_multicast_querier_exists(br, ð); + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); + +/** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 23caf5b0309e..62a7fa2e3569 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -399,7 +399,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, int idx); + struct net_device *dev, struct net_device *fdev, int idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 629dc77874a9..4ce0b313f72c 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -14,6 +14,9 @@ config NFT_BRIDGE_META help Add support for bridge dedicated meta key. +config NF_LOG_BRIDGE + tristate "Bridge packet logging" + endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES @@ -202,22 +205,6 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. -config BRIDGE_EBT_ULOG - tristate "ebt: ulog support (OBSOLETE)" - help - This option enables the old bridge-specific "ebt_ulog" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds the ulog watcher, that you can use in any rule - in any ebtables table. The packet is passed to a userspace - logging daemon using netlink multicast sockets. This differs - from the log watcher in the sense that the complete packet is - sent to userspace instead of a descriptive text and that - netlink multicast sockets are used instead of the syslog. - - To compile it as a module, choose M here. If unsure, say N. - config BRIDGE_EBT_NFLOG tristate "ebt: nflog support" help diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 6f2f3943d66f..1f78ea0d90e4 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -5,6 +5,9 @@ obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o +# packet logging +obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o + obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o # tables diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 5322a36867a3..17f2e4bc2a29 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -186,6 +186,10 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.log.level = info->loglevel; li.u.log.logflags = info->bitmask; + /* Remember that we have to use ebt_log_packet() not to break backward + * compatibility. We cannot use the default bridge packet logger via + * nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo + */ if (info->bitmask & EBT_LOG_NFLOG) nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, "%s", info->prefix); @@ -205,54 +209,13 @@ static struct xt_target ebt_log_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_logger ebt_log_logger __read_mostly = { - .name = "ebt_log", - .logfn = &ebt_log_packet, - .me = THIS_MODULE, -}; - -static int __net_init ebt_log_net_init(struct net *net) -{ - nf_log_set(net, NFPROTO_BRIDGE, &ebt_log_logger); - return 0; -} - -static void __net_exit ebt_log_net_fini(struct net *net) -{ - nf_log_unset(net, &ebt_log_logger); -} - -static struct pernet_operations ebt_log_net_ops = { - .init = ebt_log_net_init, - .exit = ebt_log_net_fini, -}; - static int __init ebt_log_init(void) { - int ret; - - ret = register_pernet_subsys(&ebt_log_net_ops); - if (ret < 0) - goto err_pernet; - - ret = xt_register_target(&ebt_log_tg_reg); - if (ret < 0) - goto err_target; - - nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger); - - return ret; - -err_target: - unregister_pernet_subsys(&ebt_log_net_ops); -err_pernet: - return ret; + return xt_register_target(&ebt_log_tg_reg); } static void __exit ebt_log_fini(void) { - unregister_pernet_subsys(&ebt_log_net_ops); - nf_log_unregister(&ebt_log_logger); xt_unregister_target(&ebt_log_tg_reg); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c deleted file mode 100644 index 7c470c371e14..000000000000 --- a/net/bridge/netfilter/ebt_ulog.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * netfilter module for userspace bridged Ethernet frames logging daemons - * - * Authors: - * Bart De Schuymer <bdschuym@pandora.be> - * Harald Welte <laforge@netfilter.org> - * - * November, 2004 - * - * Based on ipt_ULOG.c, which is - * (C) 2000-2002 by Harald Welte <laforge@netfilter.org> - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - * - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/socket.h> -#include <linux/skbuff.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <net/netlink.h> -#include <linux/netdevice.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_bridge/ebtables.h> -#include <linux/netfilter_bridge/ebt_ulog.h> -#include <net/netfilter/nf_log.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include "../br_private.h" - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0600); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) " - "(defaults to 4096)"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) " - "(defaults to 10)"); - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ - spinlock_t lock; /* the per-queue lock */ -} ebt_ulog_buff_t; - -static int ebt_ulog_net_id __read_mostly; -struct ebt_ulog_net { - unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS]; - ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; - struct sock *ebtulognl; -}; - -static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net) -{ - return net_generic(net, ebt_ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup) -{ - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup]; - - del_timer(&ub->timer); - - if (!ub->skb) - return; - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroup + 1; - netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; -} - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - struct ebt_ulog_net *ebt = container_of((void *)data, - struct ebt_ulog_net, - nlgroup[*(unsigned int *)data]); - - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data]; - spin_lock_bh(&ub->lock); - if (ub->skb) - ulog_send(ebt, *(unsigned int *)data); - spin_unlock_bh(&ub->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate buffer of size %ub\n", - size); - } - } - - return skb; -} - -static void ebt_ulog_packet(struct net *net, unsigned int hooknr, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ebt_ulog_info *uloginfo, - const char *prefix) -{ - ebt_ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - unsigned int group = uloginfo->nlgroup; - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; - spinlock_t *lock = &ub->lock; - ktime_t kt; - - if ((uloginfo->cprange == 0) || - (uloginfo->cprange > skb->len + ETH_HLEN)) - copy_len = skb->len + ETH_HLEN; - else - copy_len = uloginfo->cprange; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - if (size > nlbufsiz) { - pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz); - return; - } - - spin_lock_bh(lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } else if (size > skb_tailroom(ub->skb)) { - ulog_send(ebt, group); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, 0, - size - NLMSG_ALIGN(sizeof(*nlh)), 0); - if (!nlh) { - kfree_skb(ub->skb); - ub->skb = NULL; - goto unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* Fill in the ulog data */ - pm->version = EBT_ULOG_VERSION; - kt = ktime_get_real(); - pm->stamp = ktime_to_timeval(kt); - if (ub->qlen == 1) - ub->skb->tstamp = kt; - pm->data_len = copy_len; - pm->mark = skb->mark; - pm->hook = hooknr; - if (uloginfo->prefix != NULL) - strcpy(pm->prefix, uloginfo->prefix); - - if (in) { - strcpy(pm->physindev, in->name); - /* If in isn't a bridge, then physindev==indev */ - if (br_port_exists(in)) - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name); - else - strcpy(pm->indev, in->name); - } - - if (out) { - /* If out exists, then out is a bridge port */ - strcpy(pm->physoutdev, out->name); - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name); - } - - if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) - BUG(); - - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - if (ub->qlen >= uloginfo->qthreshold) - ulog_send(ebt, group); - else if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - -unlock: - spin_unlock_bh(lock); -} - -/* this function is registered with the netfilter core */ -static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct nf_loginfo *li, - const char *prefix) -{ - struct ebt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; - loginfo.cprange = 0; - loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nlgroup = li->u.ulog.group; - loginfo.cprange = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static unsigned int -ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return EBT_CONTINUE; -} - -static int ebt_ulog_tg_check(const struct xt_tgchk_param *par) -{ - struct ebt_ulog_info *uloginfo = par->targinfo; - - if (!par->net->xt.ebt_ulog_warn_deprecated) { - pr_info("ebt_ulog is deprecated and it will be removed soon, " - "use ebt_nflog instead\n"); - par->net->xt.ebt_ulog_warn_deprecated = true; - } - - if (uloginfo->nlgroup > 31) - return -EINVAL; - - uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0'; - - if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN) - uloginfo->qthreshold = EBT_ULOG_MAX_QLEN; - - return 0; -} - -static struct xt_target ebt_ulog_tg_reg __read_mostly = { - .name = "ulog", - .revision = 0, - .family = NFPROTO_BRIDGE, - .target = ebt_ulog_tg, - .checkentry = ebt_ulog_tg_check, - .targetsize = sizeof(struct ebt_ulog_info), - .me = THIS_MODULE, -}; - -static struct nf_logger ebt_ulog_logger __read_mostly = { - .name = "ebt_ulog", - .logfn = &ebt_log_packet, - .me = THIS_MODULE, -}; - -static int __net_init ebt_ulog_net_init(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - struct netlink_kernel_cfg cfg = { - .groups = EBT_ULOG_MAXNLGROUPS, - }; - - /* initialize ulog_buffers */ - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt->nlgroup[i] = i; - setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ebt->nlgroup[i]); - spin_lock_init(&ebt->ulog_buffers[i].lock); - } - - ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ebt->ebtulognl) - return -ENOMEM; - - nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger); - return 0; -} - -static void __net_exit ebt_ulog_net_fini(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - nf_log_unset(net, &ebt_ulog_logger); - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i]; - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } - netlink_kernel_release(ebt->ebtulognl); -} - -static struct pernet_operations ebt_ulog_net_ops = { - .init = ebt_ulog_net_init, - .exit = ebt_ulog_net_fini, - .id = &ebt_ulog_net_id, - .size = sizeof(struct ebt_ulog_net), -}; - -static int __init ebt_ulog_init(void) -{ - int ret; - - if (nlbufsiz >= 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB," - "please try a smaller nlbufsiz parameter.\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ebt_ulog_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ebt_ulog_tg_reg); - if (ret) - goto out_target; - - nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ebt_ulog_net_ops); -out_pernet: - return ret; -} - -static void __exit ebt_ulog_fini(void) -{ - nf_log_unregister(&ebt_ulog_logger); - xt_unregister_target(&ebt_ulog_tg_reg); - unregister_pernet_subsys(&ebt_ulog_net_ops); -} - -module_init(ebt_ulog_init); -module_exit(ebt_ulog_fini); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); -MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG"); diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c new file mode 100644 index 000000000000..5d9953a90929 --- /dev/null +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -0,0 +1,96 @@ +/* + * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_bridge.h> +#include <linux/ip.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_log.h> + +static void nf_log_bridge_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + case htons(ETH_P_IPV6): + nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + } +} + +static struct nf_logger nf_bridge_logger __read_mostly = { + .name = "nf_log_bridge", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_bridge_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_bridge_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); + return 0; +} + +static void __net_exit nf_log_bridge_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_bridge_logger); +} + +static struct pernet_operations nf_log_bridge_net_ops = { + .init = nf_log_bridge_net_init, + .exit = nf_log_bridge_net_exit, +}; + +static int __init nf_log_bridge_init(void) +{ + int ret; + + /* Request to load the real packet loggers. */ + nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); + nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); + nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); + + ret = register_pernet_subsys(&nf_log_bridge_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); + return 0; +} + +static void __exit nf_log_bridge_exit(void) +{ + unregister_pernet_subsys(&nf_log_bridge_net_ops); + nf_log_unregister(&nf_bridge_logger); +} + +module_init(nf_log_bridge_init); +module_exit(nf_log_bridge_exit); + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter bridge packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index e8437094d15f..43f750e88e19 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -908,8 +908,7 @@ static int caif_release(struct socket *sock) sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); - if (cf_sk->debugfs_socket_dir != NULL) - debugfs_remove_recursive(cf_sk->debugfs_socket_dir); + debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 0f455227da83..f5afda1abc76 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -547,7 +547,6 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) default: pr_err("Unrecognized Control Frame\n"); goto error; - break; } ret = 0; error: diff --git a/net/core/dev.c b/net/core/dev.c index 30eedf677913..e1b7cfaccd65 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -148,6 +148,9 @@ struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -1082,6 +1085,7 @@ static int dev_get_valid_name(struct net *net, */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1109,10 +1113,17 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } @@ -1141,6 +1152,8 @@ rollback: write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1207,7 +1220,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - call_netdevice_notifiers(NETDEV_CHANGE, dev); + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = 0; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } @@ -2407,8 +2424,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } @@ -2738,8 +2755,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. - * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) @@ -4089,6 +4106,8 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; @@ -4227,9 +4246,8 @@ static int process_backlog(struct napi_struct *napi, int quota) #endif napi->weight = weight_p; local_irq_disable(); - while (work < quota) { + while (1) { struct sk_buff *skb; - unsigned int qlen; while ((skb = __skb_dequeue(&sd->process_queue))) { local_irq_enable(); @@ -4243,24 +4261,24 @@ static int process_backlog(struct napi_struct *napi, int quota) } rps_lock(sd); - qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen) - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); - - if (qlen < quota - work) { + if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, - * and NAPI_STATE_SCHED is the only possible flag set on backlog. - * we can use a plain write instead of clear_bit(), + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ list_del(&napi->poll_list); napi->state = 0; + rps_unlock(sd); - quota = work + qlen; + break; } + + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); rps_unlock(sd); } local_irq_enable(); @@ -5432,13 +5450,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; @@ -6438,17 +6452,19 @@ void netdev_freemem(struct net_device *dev) /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -6527,6 +6543,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; @@ -6938,12 +6955,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev, if (dev && dev->dev.parent) { r = dev_printk_emit(level[1] - '0', dev->dev.parent, - "%s %s %s: %pV", + "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), - netdev_name(dev), vaf); + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + r = printk("%s%s%s: %pV", level, netdev_name(dev), + netdev_reg_state(dev), vaf); } else { r = printk("%s(NULL net_device): %pV", level, vaf); } @@ -7095,7 +7114,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index e70301eb7a4a..50f9a9db5792 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb, switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return set_all_monitor_traces(TRACE_ON); - break; case NET_DM_CMD_STOP: return set_all_monitor_traces(TRACE_OFF); - break; } return -ENOTSUPP; diff --git a/net/core/dst.c b/net/core/dst.c index 80d6286c8b62..a028409ee438 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -269,6 +269,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_destroy_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -276,11 +285,8 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); diff --git a/net/core/filter.c b/net/core/filter.c index 735fad897496..f3b2d5e9fe5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -45,54 +45,6 @@ #include <linux/seccomp.h> #include <linux/if_vlan.h> -/* Registers */ -#define BPF_R0 regs[BPF_REG_0] -#define BPF_R1 regs[BPF_REG_1] -#define BPF_R2 regs[BPF_REG_2] -#define BPF_R3 regs[BPF_REG_3] -#define BPF_R4 regs[BPF_REG_4] -#define BPF_R5 regs[BPF_REG_5] -#define BPF_R6 regs[BPF_REG_6] -#define BPF_R7 regs[BPF_REG_7] -#define BPF_R8 regs[BPF_REG_8] -#define BPF_R9 regs[BPF_REG_9] -#define BPF_R10 regs[BPF_REG_10] - -/* Named registers */ -#define DST regs[insn->dst_reg] -#define SRC regs[insn->src_reg] -#define FP regs[BPF_REG_FP] -#define ARG1 regs[BPF_REG_ARG1] -#define CTX regs[BPF_REG_CTX] -#define IMM insn->imm - -/* No hurry in this branch - * - * Exported for the bpf jit load helper. - */ -void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) -{ - u8 *ptr = NULL; - - if (k >= SKF_NET_OFF) - ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) - ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) - return ptr; - - return NULL; -} - -static inline void *load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) -{ - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - - return bpf_internal_load_pointer_neg_helper(skb, k, size); -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff @@ -135,451 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Base function for offset calculation. Needs to go into .text section, - * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. - */ -noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -/** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply - * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. - */ -static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; - static const void *jumptable[256] = { - [0 ... 255] = &&default_label, - /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - }; - void *ptr; - int off; - -#define CONT ({ insn++; goto select_insn; }) -#define CONT_JMP ({ insn++; goto select_insn; }) - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - -select_insn: - goto *jumptable[insn->code]; - - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ - CONT; - - ALU(ADD, +) - ALU(SUB, -) - ALU(AND, &) - ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) - ALU(XOR, ^) - ALU(MUL, *) -#undef ALU - ALU_NEG: - DST = (u32) -DST; - CONT; - ALU64_NEG: - DST = -DST; - CONT; - ALU_MOV_X: - DST = (u32) SRC; - CONT; - ALU_MOV_K: - DST = (u32) IMM; - CONT; - ALU64_MOV_X: - DST = SRC; - CONT; - ALU64_MOV_K: - DST = IMM; - CONT; - ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; - CONT; - ALU64_ARSH_K: - (*(s64 *) &DST) >>= IMM; - CONT; - ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = DST; - DST = do_div(tmp, SRC); - CONT; - ALU_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); - CONT; - ALU64_MOD_K: - tmp = DST; - DST = do_div(tmp, IMM); - CONT; - ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); - CONT; - ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; - do_div(DST, SRC); - CONT; - ALU_DIV_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; - CONT; - ALU64_DIV_K: - do_div(DST, IMM); - CONT; - ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; - CONT; - ALU_END_TO_BE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_be16(DST); - break; - case 32: - DST = (__force u32) cpu_to_be32(DST); - break; - case 64: - DST = (__force u64) cpu_to_be64(DST); - break; - } - CONT; - ALU_END_TO_LE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_le16(DST); - break; - case 32: - DST = (__force u32) cpu_to_le32(DST); - break; - case 64: - DST = (__force u64) cpu_to_le64(DST); - break; - } - CONT; - - /* CALL */ - JMP_CALL: - /* Function call scratches BPF_R1-BPF_R5 registers, - * preserves BPF_R6-BPF_R9, and stores return value - * into BPF_R0. - */ - BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, - BPF_R4, BPF_R5); - CONT; - - /* JMP */ - JMP_JA: - insn += insn->off; - CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_EXIT: - return BPF_R0; - - /* STX and ST and LDX*/ -#define LDST(SIZEOP, SIZE) \ - STX_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ - CONT; \ - ST_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ - CONT; \ - LDX_MEM_##SIZEOP: \ - DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ - CONT; - - LDST(B, u8) - LDST(H, u16) - LDST(W, u32) - LDST(DW, u64) -#undef LDST - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, sk_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; - - default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); - return 0; -} - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. @@ -667,9 +174,9 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) } static bool convert_bpf_extensions(struct sock_filter *fp, - struct sock_filter_int **insnp) + struct bpf_insn **insnp) { - struct sock_filter_int *insn = *insnp; + struct bpf_insn *insn = *insnp; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -819,7 +326,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * sk_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF @@ -829,10 +336,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * ctx == 'struct seccomp_data *'. */ int sk_convert_filter(struct sock_filter *prog, int len, - struct sock_filter_int *new_prog, int *new_len) + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; - struct sock_filter_int *new_insn; + struct bpf_insn *new_insn; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -840,11 +347,11 @@ int sk_convert_filter(struct sock_filter *prog, int len, BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - if (len <= 0 || len >= BPF_MAXINSNS) + if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { - addrs = kzalloc(len * sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; } @@ -858,8 +365,8 @@ do_pass: new_insn++; for (i = 0; i < len; fp++, i++) { - struct sock_filter_int tmp_insns[6] = { }; - struct sock_filter_int *insn = tmp_insns; + struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - new_prog; @@ -1094,14 +601,14 @@ err: * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ -static int check_load_and_stores(struct sock_filter *filter, int flen) +static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); - masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; @@ -1227,7 +734,7 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +int sk_chk_filter(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -1237,7 +744,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { - struct sock_filter *ftest = &filter[pc]; + const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) @@ -1382,7 +889,7 @@ static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, fp_new = sock_kmalloc(sk, len, GFP_KERNEL); if (fp_new) { *fp_new = *fp; - /* As we're kepping orig_prog in fp_new along, + /* As we're keeping orig_prog in fp_new along, * we need to make sure we're not evicting it * from the old fp. */ @@ -1406,7 +913,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, * representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != - sizeof(struct sock_filter_int)); + sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd @@ -1438,7 +945,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, fp->len = new_len; - /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) /* 2nd sk_convert_filter() can fail only if it fails @@ -1464,33 +971,6 @@ out_err: return ERR_PTR(err); } -void __weak bpf_int_jit_compile(struct sk_filter *prog) -{ -} - -/** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program - * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro - */ -void sk_filter_select_runtime(struct sk_filter *fp) -{ - fp->bpf_func = (void *) __sk_run_filter; - - /* Probe if internal BPF can be JITed */ - bpf_int_jit_compile(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); - -/* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) -{ - bpf_jit_free(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_free); - static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { @@ -1524,8 +1004,8 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /** * sk_unattached_filter_create - create an unattached filter - * @fprog: the filter program * @pfp: the unattached filter that is created + * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 107ed12a5323..5f362c1d0332 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -80,6 +80,8 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; + __be32 flow_label; + ipv6: iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); if (!iph) @@ -89,6 +91,21 @@ ipv6: flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); + + flow_label = ip6_flowlabel(iph); + if (flow_label) { + /* Awesome, IPv6 packet has a flow label so we can + * use that to represent the ports without any + * further dissection. + */ + flow->n_proto = proto; + flow->ip_proto = ip_proto; + flow->ports = flow_label; + flow->thoff = (u16)nhoff; + + return true; + } + break; } case htons(ETH_P_8021AD): @@ -175,6 +192,7 @@ ipv6: break; } + flow->n_proto = proto; flow->ip_proto = ip_proto; flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; @@ -195,12 +213,33 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -static __always_inline u32 __flow_hash_1word(u32 a) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys) { - __flow_hash_secret_init(); - return jhash_1word(a, hashrnd); + u32 hash; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys->dst < (__force u32)keys->src) || + (((__force u32)keys->dst == (__force u32)keys->src) && + ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { + swap(keys->dst, keys->src); + swap(keys->port16[0], keys->port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys->dst, + (__force u32)keys->src, + (__force u32)keys->ports); + if (!hash) + hash = 1; + + return hash; } +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + return __flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(flow_hash_from_keys); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -210,7 +249,6 @@ static __always_inline u32 __flow_hash_1word(u32 a) void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; if (!skb_flow_dissect(skb, &keys)) return; @@ -218,21 +256,9 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = __flow_hash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports); - if (!hash) - hash = 1; + skb->sw_hash = 1; - skb->hash = hash; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); @@ -240,7 +266,7 @@ EXPORT_SYMBOL(__skb_get_hash); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; @@ -260,13 +286,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = __flow_hash_1word(hash); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -338,17 +358,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map) { if (map->len == 1) queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->hash; - hash = __flow_hash_1word(hash); + else queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } + ((u64)skb_get_hash(skb) * map->len) >> 32]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } diff --git a/net/core/iovec.c b/net/core/iovec.c index b61869429f4c..827dd6beb49c 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -75,61 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a } /* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); - -/* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram * Calls to csum_partial but the last must be in 32 bit chunks diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 32d872eec7f5..559890b0f0a2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3059,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { + struct neigh_table *tbl = p->tbl; dev_name_source = "default"; - t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); - t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; - t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; - t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..9dd06699b09c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = (*format)(net, buf); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); read_unlock(&dev_base_lock); return ret; @@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev, /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ -static ssize_t format_##field(const struct net_device *net, char *buf) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, net->field); \ + return sprintf(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -112,16 +112,35 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) +{ + return sprintf(buf, fmt_dec, dev->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (ndev->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); read_unlock(&dev_base_lock); return ret; } @@ -130,18 +149,18 @@ static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); - if (dev_isalive(net)) - return sysfs_format_mac(buf, net->broadcast, net->addr_len); + struct net_device *ndev = to_net_dev(dev); + if (dev_isalive(ndev)) + return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; } static DEVICE_ATTR_RO(broadcast); -static int change_carrier(struct net_device *net, unsigned long new_carrier) +static int change_carrier(struct net_device *dev, unsigned long new_carrier) { - if (!netif_running(net)) + if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(net, (bool) new_carrier); + return dev_change_carrier(dev, (bool) new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, @@ -265,9 +284,9 @@ static DEVICE_ATTR_RO(carrier_changes); /* read-write attributes */ -static int change_mtu(struct net_device *net, unsigned long new_mtu) +static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(net, (int) new_mtu); + return dev_set_mtu(dev, (int) new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, @@ -277,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(mtu, fmt_dec); -static int change_flags(struct net_device *net, unsigned long new_flags) +static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(net, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int) new_flags); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, @@ -289,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - net->tx_queue_len = new_len; + dev->tx_queue_len = new_len; return 0; } @@ -344,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev, } static DEVICE_ATTR_RW(ifalias); -static int change_group(struct net_device *net, unsigned long new_group) +static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(net, (int) new_group); + dev_set_group(dev, (int) new_group); return 0; } @@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, @@ -776,20 +796,20 @@ static struct kobj_type rx_queue_ktype = { .namespace = rx_queue_namespace }; -static int rx_queue_add_kobject(struct net_device *net, int index) +static int rx_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto exit; - if (net->sysfs_rx_queue_group) { - error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto exit; } @@ -805,18 +825,18 @@ exit: #endif /* CONFIG_SYSFS */ int -net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS - if (!net->sysfs_rx_queue_group) + if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { - error = rx_queue_add_kobject(net, i); + error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -824,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - if (net->sysfs_rx_queue_group) - sysfs_remove_group(&net->_rx[i].kobj, - net->sysfs_rx_queue_group); - kobject_put(&net->_rx[i].kobj); + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(&dev->_rx[i].kobj, + dev->sysfs_rx_queue_group); + kobject_put(&dev->_rx[i].kobj); } return error; @@ -1135,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = { .namespace = netdev_queue_namespace, }; -static int netdev_queue_add_kobject(struct net_device *net, int index) +static int netdev_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_queue *queue = net->_tx + index; + struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) @@ -1164,14 +1184,14 @@ exit: #endif /* CONFIG_SYSFS */ int -netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; for (i = old_num; i < new_num; i++) { - error = netdev_queue_add_kobject(net, i); + error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -1179,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = dev->_tx + i; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1193,25 +1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } -static int register_queue_kobjects(struct net_device *net) +static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) return -ENOMEM; - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - error = net_rx_queue_update_kobjects(net, 0, real_rx); + error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, real_tx); + error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; @@ -1219,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net) return 0; error: - netdev_queue_update_kobjects(net, txq, 0); - net_rx_queue_update_kobjects(net, rxq, 0); + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); return error; } -static void remove_queue_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - net_rx_queue_update_kobjects(net, real_rx, 0); - netdev_queue_update_kobjects(net, real_tx, 0); + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); #ifdef CONFIG_SYSFS - kset_unregister(net->queues_kset); + kset_unregister(dev->queues_kset); #endif } @@ -1329,13 +1349,13 @@ static struct class net_class = { /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_kobject(struct net_device * net) +void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); + struct device *dev = &(ndev->dev); kobject_get(&dev->kobj); - remove_queue_kobjects(net); + remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); @@ -1343,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net) } /* Create sysfs entries for network device. */ -int netdev_register_kobject(struct net_device *net) +int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); - const struct attribute_group **groups = net->sysfs_groups; + struct device *dev = &(ndev->dev); + const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; - dev->platform_data = net; + dev->platform_data = ndev; dev->groups = groups; - dev_set_name(dev, "%s", net->name); + dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ @@ -1364,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (net->ieee80211_ptr) + if (ndev->ieee80211_ptr) *groups++ = &wireless_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (net->wireless_handlers) + else if (ndev->wireless_handlers) *groups++ = &wireless_group; #endif #endif @@ -1377,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; - error = register_queue_kobjects(net); + error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e33937fb32a0..907fb5e36c02 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -822,7 +822,8 @@ void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); - } + } else + RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fc17a9d309ac..8b849ddfef2e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,8 +69,9 @@ * for running devices in the if_list and sends packets until count is 0 it * also the thread checks the thread->control which is used for inter-process * communication. controlling process "posts" operations to the threads this - * way. The if_lock should be possible to remove when add/rem_device is merged - * into this too. + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" @@ -208,7 +209,7 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ -/* If lock -- can be removed after some work */ +/* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -241,6 +242,7 @@ struct pktgen_dev { struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ int running; /* if false, the test will stop */ @@ -802,7 +804,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\t': case ' ': goto done_str; - break; default: break; } @@ -1737,14 +1738,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) seq_puts(seq, "Running: "); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); seq_puts(seq, "\nStopped: "); - list_for_each_entry(pkt_dev, &t->if_list, list) + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (!pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); @@ -1753,7 +1754,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_puts(seq, "\nResult: NA\n"); - if_unlock(t); + rcu_read_unlock(); return 0; } @@ -1878,10 +1879,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1931,7 +1930,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -1946,6 +1946,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d dev->name); break; } + rcu_read_unlock(); } } @@ -2997,8 +2998,8 @@ static void pktgen_run(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { /* * setup odev and create initial packet. @@ -3007,18 +3008,18 @@ static void pktgen_run(struct pktgen_thread *t) if (pkt_dev->odev) { pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); strcpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself! */ started++; } else strcpy(pkt_dev->result, "Error starting"); } - if_unlock(t); + rcu_read_unlock(); if (started) t->control &= ~(T_STOP); } @@ -3041,27 +3042,25 @@ static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); return 1; + } + rcu_read_unlock(); return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) { - if_lock(t); - while (thread_is_running(t)) { - if_unlock(t); - msleep_interruptible(100); if (signal_pending(current)) goto signal; - if_lock(t); } - if_unlock(t); return 1; signal: return 0; @@ -3166,10 +3165,10 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) return -EINVAL; } + pkt_dev->running = 0; kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; pkt_dev->stopped_at = ktime_get(); - pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3180,9 +3179,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev, *best = NULL; - if_lock(t); - - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; if (best == NULL) @@ -3190,7 +3188,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } - if_unlock(t); + rcu_read_unlock(); + return best; } @@ -3200,13 +3199,13 @@ static void pktgen_stop(struct pktgen_thread *t) func_enter(); - if_lock(t); + rcu_read_lock(); - list_for_each_entry(pkt_dev, &t->if_list, list) { + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); } - if_unlock(t); + rcu_read_unlock(); } /* @@ -3220,8 +3219,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3235,8 +3232,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } static void pktgen_rem_all_ifs(struct pktgen_thread *t) @@ -3248,8 +3243,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3258,8 +3251,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3407,10 +3398,10 @@ static int pktgen_thread_worker(void *arg) pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); - set_current_state(TASK_INTERRUPTIBLE); - set_freezable(); + __set_current_state(TASK_RUNNING); + while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3424,8 +3415,6 @@ static int pktgen_thread_worker(void *arg) continue; } - __set_current_state(TASK_RUNNING); - if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); @@ -3456,9 +3445,8 @@ static int pktgen_thread_worker(void *arg) } try_to_freeze(); - - set_current_state(TASK_INTERRUPTIBLE); } + set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); @@ -3485,8 +3473,8 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, struct pktgen_dev *p, *pkt_dev = NULL; size_t len = strlen(ifname); - if_lock(t); - list_for_each_entry(p, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) if (strncmp(p->odevname, ifname, len) == 0) { if (p->odevname[len]) { if (exact || p->odevname[len] != '@') @@ -3496,7 +3484,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, break; } - if_unlock(t); + rcu_read_unlock(); pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3510,6 +3498,12 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3518,9 +3512,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, goto out; } - list_add(&pkt_dev->list, &t->if_list); - pkt_dev->pg_thread = t; pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); out: if_unlock(t); @@ -3675,11 +3669,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, struct list_head *q, *n; struct pktgen_dev *p; + if_lock(t); list_for_each_safe(q, n, &t->if_list) { p = list_entry(q, struct pktgen_dev, list); if (p == pkt_dev) - list_del(&p->list); + list_del_rcu(&p->list); } + if_unlock(t); } static int pktgen_remove_device(struct pktgen_thread *t, @@ -3699,20 +3695,22 @@ static int pktgen_remove_device(struct pktgen_thread *t, pkt_dev->odev = NULL; } - /* And update the thread if_list */ - - _rem_dev_from_if_list(t, pkt_dev); - + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() */ if (pkt_dev->entry) proc_remove(pkt_dev->entry); + /* And update the thread if_list */ + _rem_dev_from_if_list(t, pkt_dev); + #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); if (pkt_dev->page) put_page(pkt_dev->page); - kfree(pkt_dev); + kfree_rcu(pkt_dev, rcu); return 0; } @@ -3812,6 +3810,7 @@ static void __exit pg_cleanup(void) { unregister_netdevice_notifier(&pktgen_notifier_block); unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ } module_init(pg_init); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d3027a73fd4b..12ab7b4be609 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -52,14 +52,43 @@ * test_8021q: * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? * ldh [16] ; load inner type - * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? * ldb [18] ; load payload * and #0x8 ; as we don't have ports here, test * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x40 ; PTP_CLASS_V2_VLAN + * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? + * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? + * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * * ; PTP over Ethernet * test_ieee1588: @@ -113,16 +142,39 @@ void __init ptp_classifier_init(void) { 0x44, 0, 0, 0x00000020 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, - { 0x15, 0, 9, 0x00008100 }, + { 0x15, 0, 32, 0x00008100 }, { 0x28, 0, 0, 0x00000010 }, - { 0x15, 0, 15, 0x000088f7 }, + { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x00000008 }, - { 0x15, 0, 12, 0x00000000 }, + { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000040 }, + { 0x44, 0, 0, 0x00000070 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000050 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000060 }, { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x00000008 }, diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 467f326126e0..04db318e6218 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) { size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; + struct listen_sock *lopt = NULL; nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) + + if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + lopt = kzalloc(lopt_size, GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + if (!lopt) lopt = vzalloc(lopt_size); - else - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (lopt == NULL) + if (!lopt) return -ENOMEM; - for (lopt->max_qlen_log = 3; - (1 << lopt->max_qlen_log) < nr_table_entries; - lopt->max_qlen_log++); - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; + lopt->max_qlen_log = ilog2(nr_table_entries); write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, void __reqsk_queue_destroy(struct request_sock_queue *queue) { - struct listen_sock *lopt; - size_t lopt_size; - - /* - * this is an error recovery path only - * no locking needed and the lopt is not NULL - */ - - lopt = queue->listen_opt; - lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + /* This is an error recovery path only, no locking needed */ + kvfree(queue->listen_opt); } static inline struct listen_sock *reqsk_queue_yank_listen_sk( @@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) { /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - size_t lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); if (lopt->qlen != 0) { unsigned int i; @@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) } WARN_ON(lopt->qlen != 0); - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + kvfree(lopt); } /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1063996f8317..8d39071f32d7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - if (!ops->dellink) + /* The check for setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if (ops->setup && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -1777,7 +1782,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) return -ENODEV; ops = dev->rtnl_link_ops; - if (!ops) + if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); @@ -1805,7 +1810,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, - char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) + char *ifname, unsigned char name_assign_type, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -1823,8 +1829,8 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, - num_tx_queues, num_rx_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, + ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; @@ -1889,6 +1895,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + unsigned char name_assign_type = NET_NAME_USER; int err; #ifdef CONFIG_MODULES @@ -2038,14 +2045,19 @@ replay: return -EOPNOTSUPP; } - if (!ifname[0]) + if (!ops->setup) + return -EOPNOTSUPP; + + if (!ifname[0]) { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } dest_net = rtnl_link_get_net(net, tb); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - dev = rtnl_create_link(dest_net, ifname, ops, tb); + dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -2380,22 +2392,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr) { - int err = -EOPNOTSUPP; + int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); - return -EINVAL; + return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); - else - err = -EINVAL; return err; } @@ -2509,6 +2519,7 @@ skip: int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { int err; @@ -2526,28 +2537,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; - struct net *net = sock_net(skb->sk); struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *bdev = NULL; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct ifinfomsg *ifm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + int brport_idx = 0; + int br_idx = 0; + int idx = 0; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *br_dev; - const struct net_device_ops *ops; + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy) == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + brport_idx = ifm->ifi_index; - br_dev = netdev_master_upper_dev_get(dev); - ops = br_dev->netdev_ops; - if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, idx); + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; + + ops = br_dev->netdev_ops; + bdev = br_dev; + } + + for_each_netdev(net, dev) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + + bdev = dev; + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + + bdev = br_dev; + cops = ops; + } + + if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (cops && cops->ndo_fdb_dump) + idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + idx); } + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev, + idx); + + cops = NULL; } - rcu_read_unlock(); cb->args[0] = idx; return skb->len; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9cd5344fad73..c1a33033cbe2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2993,7 +2993,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, skb_put(nskb, len), len, 0); SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + offset; + skb_headroom(nskb) + doffset; continue; } diff --git a/net/core/sock.c b/net/core/sock.c index 026e01f70274..ca9b65199d28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 6521dfd8b7c8..a8770391ea5b 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -43,31 +43,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) return; type = classify(skb); + if (type == PTP_CLASS_NONE) + return; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) - return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; - phydev->drv->txtstamp(phydev, clone, type); + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return; } - break; - default: - break; + + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); @@ -114,20 +105,12 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) __skb_pull(skb, ETH_HLEN); - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); - break; - default: - break; - } + if (type == PTP_CLASS_NONE) + return false; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); return false; } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index f8b98d89c285..c34af7a1d2d4 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -471,7 +471,11 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { - up = netdev->dcbnl_ops->getapp(netdev, idtype, id); + ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); + if (ret < 0) + return ret; + else + up = ret; } else { struct dcb_app app = { .selector = idtype, @@ -538,6 +542,8 @@ static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); + if (ret < 0) + return ret; } else { struct dcb_app app; app.selector = idtype; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4db3c2a1679c..04cb17d4b0ce 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet6_reqsk_alloc(&dccp6_request_sock_ops); + req = inet_reqsk_alloc(&dccp6_request_sock_ops); if (req == NULL) goto drop; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index c69eb9c4fbb8..b50dc436db1f 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -55,11 +55,9 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); - tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = np->ipv6only; + tw->tw_ipv6only = sk->sk_ipv6only; } #endif /* Linkage updates. */ diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 9acec61f5433..dd8696a3dbec 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -150,7 +150,7 @@ int dns_query(const char *type, const char *name, size_t namelen, goto put; memcpy(*_result, upayload->data, len); - *_result[len] = '\0'; + (*_result)[len] = '\0'; if (_expiry) *_expiry = rkey->expiry; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5db37cef50a9..0a49632fac47 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -351,8 +351,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) for (i = 0; i < pd->nr_chips; i++) { port_index = 0; while (port_index < DSA_MAX_PORTS) { - if (pd->chip[i].port_names[port_index]) - kfree(pd->chip[i].port_names[port_index]); + kfree(pd->chip[i].port_names[port_index]); port_index++; } kfree(pd->chip[i].rtable); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 64c5af0a10dd..45a1e34c89e0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -340,8 +340,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, struct dsa_slave_priv *p; int ret; - slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), - name, ether_setup); + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, + NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) return slave_dev; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 5dc638cad2e1..f405e0592407 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -390,7 +390,8 @@ EXPORT_SYMBOL(ether_setup); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { - return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs); + return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN, + ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); diff --git a/net/hsr/Makefile b/net/hsr/Makefile index b68359f181cc..9ae972a820f4 100644 --- a/net/hsr/Makefile +++ b/net/hsr/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_HSR) += hsr.o -hsr-y := hsr_main.o hsr_framereg.o hsr_device.o hsr_netlink.o +hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \ + hsr_netlink.o hsr_slave.o hsr_forward.o diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e5302b7f7ca9..a138d75751df 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * This file contains device methods for creating, using and destroying * virtual HSR devices. @@ -15,12 +15,13 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> -#include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include "hsr_device.h" +#include "hsr_slave.h" #include "hsr_framereg.h" #include "hsr_main.h" +#include "hsr_forward.h" static bool is_admin_up(struct net_device *dev) @@ -45,75 +46,108 @@ static void __hsr_set_operstate(struct net_device *dev, int transition) } } -void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2) +static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(hsr_dev)) { - __hsr_set_operstate(hsr_dev, IF_OPER_DOWN); + if (!is_admin_up(master->dev)) { + __hsr_set_operstate(master->dev, IF_OPER_DOWN); return; } - if (is_slave_up(slave1) || is_slave_up(slave2)) - __hsr_set_operstate(hsr_dev, IF_OPER_UP); + if (has_carrier) + __hsr_set_operstate(master->dev, IF_OPER_UP); else - __hsr_set_operstate(hsr_dev, IF_OPER_LOWERLAYERDOWN); + __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); } -void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2) +static bool hsr_check_carrier(struct hsr_port *master) { - if (is_slave_up(slave1) || is_slave_up(slave2)) - netif_carrier_on(hsr_dev); + struct hsr_port *port; + bool has_carrier; + + has_carrier = false; + + rcu_read_lock(); + hsr_for_each_port(master->hsr, port) + if ((port->type != HSR_PT_MASTER) && is_slave_up(port->dev)) { + has_carrier = true; + break; + } + rcu_read_unlock(); + + if (has_carrier) + netif_carrier_on(master->dev); else - netif_carrier_off(hsr_dev); + netif_carrier_off(master->dev); + + return has_carrier; } -void hsr_check_announce(struct net_device *hsr_dev, int old_operstate) +static void hsr_check_announce(struct net_device *hsr_dev, + unsigned char old_operstate) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; - hsr_priv = netdev_priv(hsr_dev); + hsr = netdev_priv(hsr_dev); if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) { /* Went up */ - hsr_priv->announce_count = 0; - hsr_priv->announce_timer.expires = jiffies + + hsr->announce_count = 0; + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); - add_timer(&hsr_priv->announce_timer); + add_timer(&hsr->announce_timer); } if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP)) /* Went down */ - del_timer(&hsr_priv->announce_timer); + del_timer(&hsr->announce_timer); } - -int hsr_get_max_mtu(struct hsr_priv *hsr_priv) +void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { - int mtu_max; - - if (hsr_priv->slave[0] && hsr_priv->slave[1]) - mtu_max = min(hsr_priv->slave[0]->mtu, hsr_priv->slave[1]->mtu); - else if (hsr_priv->slave[0]) - mtu_max = hsr_priv->slave[0]->mtu; - else if (hsr_priv->slave[1]) - mtu_max = hsr_priv->slave[1]->mtu; - else - mtu_max = HSR_TAGLEN; + struct hsr_port *master; + unsigned char old_operstate; + bool has_carrier; - return mtu_max - HSR_TAGLEN; + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + /* netif_stacked_transfer_operstate() cannot be used here since + * it doesn't set IF_OPER_LOWERLAYERDOWN (?) + */ + old_operstate = master->dev->operstate; + has_carrier = hsr_check_carrier(master); + hsr_set_operstate(master, has_carrier); + hsr_check_announce(master->dev, old_operstate); } +int hsr_get_max_mtu(struct hsr_priv *hsr) +{ + unsigned int mtu_max; + struct hsr_port *port; + + mtu_max = ETH_DATA_LEN; + rcu_read_lock(); + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + mtu_max = min(port->dev->mtu, mtu_max); + rcu_read_unlock(); + + if (mtu_max < HSR_HLEN) + return 0; + return mtu_max - HSR_HLEN; +} + + static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *master; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - if (new_mtu > hsr_get_max_mtu(hsr_priv)) { - netdev_info(hsr_priv->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", - HSR_TAGLEN); + if (new_mtu > hsr_get_max_mtu(hsr)) { + netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + HSR_HLEN); return -EINVAL; } @@ -124,164 +158,95 @@ static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) static int hsr_dev_open(struct net_device *dev) { - struct hsr_priv *hsr_priv; - int i; - char *slave_name; + struct hsr_priv *hsr; + struct hsr_port *port; + char designation; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); + designation = '\0'; - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if (hsr_priv->slave[i]) - slave_name = hsr_priv->slave[i]->name; - else - slave_name = "null"; - - if (!is_slave_up(hsr_priv->slave[i])) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a working HSR network\n", - 'A' + i, slave_name); + rcu_read_lock(); + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER) + continue; + switch (port->type) { + case HSR_PT_SLAVE_A: + designation = 'A'; + break; + case HSR_PT_SLAVE_B: + designation = 'B'; + break; + default: + designation = '?'; + } + if (!is_slave_up(port->dev)) + netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + designation, port->dev->name); } + rcu_read_unlock(); + + if (designation == '\0') + netdev_warn(dev, "No slave devices configured\n"); return 0; } + static int hsr_dev_close(struct net_device *dev) { - /* Nothing to do here. We could try to restore the state of the slaves - * to what they were before being changed by the hsr master dev's state, - * but they might have been changed manually in the mean time too, so - * taking them up or down here might be confusing and is probably not a - * good idea. - */ + /* Nothing to do here. */ return 0; } -static void hsr_fill_tag(struct hsr_ethhdr *hsr_ethhdr, struct hsr_priv *hsr_priv) +static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, + netdev_features_t features) { - unsigned long irqflags; + netdev_features_t mask; + struct hsr_port *port; - /* IEC 62439-1:2010, p 48, says the 4-bit "path" field can take values - * between 0001-1001 ("ring identifier", for regular HSR frames), - * or 1111 ("HSR management", supervision frames). Unfortunately, the - * spec writers forgot to explain what a "ring identifier" is, or - * how it is used. So we just set this to 0001 for regular frames, - * and 1111 for supervision frames. - */ - set_hsr_tag_path(&hsr_ethhdr->hsr_tag, 0x1); + mask = features; - /* IEC 62439-1:2010, p 12: "The link service data unit in an Ethernet - * frame is the content of the frame located between the Length/Type - * field and the Frame Check Sequence." + /* Mask out all features that, if supported by one device, should be + * enabled for all devices (see NETIF_F_ONE_FOR_ALL). * - * IEC 62439-3, p 48, specifies the "original LPDU" to include the - * original "LT" field (what "LT" means is not explained anywhere as - * far as I can see - perhaps "Length/Type"?). So LSDU_size might - * equal original length + 2. - * Also, the fact that this field is not used anywhere (might be used - * by a RedBox connecting HSR and PRP nets?) means I cannot test its - * correctness. Instead of guessing, I set this to 0 here, to make any - * problems immediately apparent. Anyone using this driver with PRP/HSR - * RedBoxes might need to fix this... + * Anything that's off in mask will not be enabled - so only things + * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL, + * may become enabled. */ - set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, 0); - - spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags); - hsr_ethhdr->hsr_tag.sequence_nr = htons(hsr_priv->sequence_nr); - hsr_priv->sequence_nr++; - spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags); + features &= ~NETIF_F_ONE_FOR_ALL; + hsr_for_each_port(hsr, port) + features = netdev_increment_features(features, + port->dev->features, + mask); - hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; - - hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP); + return features; } -static int slave_xmit(struct sk_buff *skb, struct hsr_priv *hsr_priv, - enum hsr_dev_idx dev_idx) +static netdev_features_t hsr_fix_features(struct net_device *dev, + netdev_features_t features) { - struct hsr_ethhdr *hsr_ethhdr; - - hsr_ethhdr = (struct hsr_ethhdr *) skb->data; + struct hsr_priv *hsr = netdev_priv(dev); - skb->dev = hsr_priv->slave[dev_idx]; - - hsr_addr_subst_dest(hsr_priv, &hsr_ethhdr->ethhdr, dev_idx); - - /* Address substitution (IEC62439-3 pp 26, 50): replace mac - * address of outgoing frame with that of the outgoing slave's. - */ - ether_addr_copy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr); - - return dev_queue_xmit(skb); + return hsr_features_recompute(hsr, features); } static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { - struct hsr_priv *hsr_priv; - struct hsr_ethhdr *hsr_ethhdr; - struct sk_buff *skb2; - int res1, res2; - - hsr_priv = netdev_priv(dev); - hsr_ethhdr = (struct hsr_ethhdr *) skb->data; - - if ((skb->protocol != htons(ETH_P_PRP)) || - (hsr_ethhdr->ethhdr.h_proto != htons(ETH_P_PRP))) { - hsr_fill_tag(hsr_ethhdr, hsr_priv); - skb->protocol = htons(ETH_P_PRP); - } - - skb2 = pskb_copy(skb, GFP_ATOMIC); - - res1 = NET_XMIT_DROP; - if (likely(hsr_priv->slave[HSR_DEV_SLAVE_A])) - res1 = slave_xmit(skb, hsr_priv, HSR_DEV_SLAVE_A); + struct hsr_priv *hsr = netdev_priv(dev); + struct hsr_port *master; - res2 = NET_XMIT_DROP; - if (likely(skb2 && hsr_priv->slave[HSR_DEV_SLAVE_B])) - res2 = slave_xmit(skb2, hsr_priv, HSR_DEV_SLAVE_B); - - if (likely(res1 == NET_XMIT_SUCCESS || res1 == NET_XMIT_CN || - res2 == NET_XMIT_SUCCESS || res2 == NET_XMIT_CN)) { - hsr_priv->dev->stats.tx_packets++; - hsr_priv->dev->stats.tx_bytes += skb->len; - } else { - hsr_priv->dev->stats.tx_dropped++; - } + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + skb->dev = master->dev; + hsr_forward_skb(skb, master); return NETDEV_TX_OK; } -static int hsr_header_create(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned int len) -{ - int res; - - /* Make room for the HSR tag now. We will fill it in later (in - * hsr_dev_xmit) - */ - if (skb_headroom(skb) < HSR_TAGLEN + ETH_HLEN) - return -ENOBUFS; - skb_push(skb, HSR_TAGLEN); - - /* To allow VLAN/HSR combos we should probably use - * res = dev_hard_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN); - * here instead. It would require other changes too, though - e.g. - * separate headers for each slave etc... - */ - res = eth_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN); - if (res <= 0) - return res; - skb_reset_mac_header(skb); - - return res + HSR_TAGLEN; -} - - static const struct header_ops hsr_header_ops = { - .create = hsr_header_create, + .create = eth_header, .parse = eth_header_parse, }; @@ -291,67 +256,63 @@ static const struct header_ops hsr_header_ops = { */ static int hsr_pad(int size) { - const int min_size = ETH_ZLEN - HSR_TAGLEN - ETH_HLEN; + const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN; if (size >= min_size) return size; return min_size; } -static void send_hsr_supervision_frame(struct net_device *hsr_dev, u8 type) +static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) { - struct hsr_priv *hsr_priv; struct sk_buff *skb; int hlen, tlen; struct hsr_sup_tag *hsr_stag; struct hsr_sup_payload *hsr_sp; unsigned long irqflags; - hlen = LL_RESERVED_SPACE(hsr_dev); - tlen = hsr_dev->needed_tailroom; + hlen = LL_RESERVED_SPACE(master->dev); + tlen = master->dev->needed_tailroom; skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen, GFP_ATOMIC); if (skb == NULL) return; - hsr_priv = netdev_priv(hsr_dev); - skb_reserve(skb, hlen); - skb->dev = hsr_dev; + skb->dev = master->dev; skb->protocol = htons(ETH_P_PRP); skb->priority = TC_PRIO_CONTROL; if (dev_hard_header(skb, skb->dev, ETH_P_PRP, - hsr_priv->sup_multicast_addr, - skb->dev->dev_addr, skb->len) < 0) + master->hsr->sup_multicast_addr, + skb->dev->dev_addr, skb->len) <= 0) goto out; + skb_reset_mac_header(skb); - skb_pull(skb, sizeof(struct ethhdr)); - hsr_stag = (typeof(hsr_stag)) skb->data; + hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag)); set_hsr_stag_path(hsr_stag, 0xf); set_hsr_stag_HSR_Ver(hsr_stag, 0); - spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags); - hsr_stag->sequence_nr = htons(hsr_priv->sequence_nr); - hsr_priv->sequence_nr++; - spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags); + spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); + master->hsr->sequence_nr++; + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_stag->HSR_TLV_Type = type; hsr_stag->HSR_TLV_Length = 12; - skb_push(skb, sizeof(struct ethhdr)); - /* Payload: MacAddressA */ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp)); - ether_addr_copy(hsr_sp->MacAddressA, hsr_dev->dev_addr); + ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); - dev_queue_xmit(skb); + hsr_forward_skb(skb, master); return; out: + WARN_ON_ONCE("HSR: Could not send supervision frame\n"); kfree_skb(skb); } @@ -360,59 +321,32 @@ out: */ static void hsr_announce(unsigned long data) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *master; - hsr_priv = (struct hsr_priv *) data; + hsr = (struct hsr_priv *) data; - if (hsr_priv->announce_count < 3) { - send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_ANNOUNCE); - hsr_priv->announce_count++; + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + + if (hsr->announce_count < 3) { + send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE); + hsr->announce_count++; } else { - send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_LIFE_CHECK); + send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK); } - if (hsr_priv->announce_count < 3) - hsr_priv->announce_timer.expires = jiffies + + if (hsr->announce_count < 3) + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); else - hsr_priv->announce_timer.expires = jiffies + + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); - if (is_admin_up(hsr_priv->dev)) - add_timer(&hsr_priv->announce_timer); -} - - -static void restore_slaves(struct net_device *hsr_dev) -{ - struct hsr_priv *hsr_priv; - int i; - int res; - - hsr_priv = netdev_priv(hsr_dev); - - rtnl_lock(); - - /* Restore promiscuity */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if (!hsr_priv->slave[i]) - continue; - res = dev_set_promiscuity(hsr_priv->slave[i], -1); - if (res) - netdev_info(hsr_dev, - "Cannot restore slave promiscuity (%s, %d)\n", - hsr_priv->slave[i]->name, res); - } - - rtnl_unlock(); -} - -static void reclaim_hsr_dev(struct rcu_head *rh) -{ - struct hsr_priv *hsr_priv; + if (is_admin_up(master->dev)) + add_timer(&hsr->announce_timer); - hsr_priv = container_of(rh, struct hsr_priv, rcu_head); - free_netdev(hsr_priv->dev); + rcu_read_unlock(); } @@ -421,14 +355,18 @@ static void reclaim_hsr_dev(struct rcu_head *rh) */ static void hsr_dev_destroy(struct net_device *hsr_dev) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; - hsr_priv = netdev_priv(hsr_dev); + hsr = netdev_priv(hsr_dev); + hsr_for_each_port(hsr, port) + hsr_del_port(port); - del_timer(&hsr_priv->announce_timer); - unregister_hsr_master(hsr_priv); /* calls list_del_rcu on hsr_priv */ - restore_slaves(hsr_dev); - call_rcu(&hsr_priv->rcu_head, reclaim_hsr_dev); /* reclaim hsr_priv */ + del_timer_sync(&hsr->prune_timer); + del_timer_sync(&hsr->announce_timer); + + synchronize_rcu(); + free_netdev(hsr_dev); } static const struct net_device_ops hsr_device_ops = { @@ -436,62 +374,51 @@ static const struct net_device_ops hsr_device_ops = { .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, + .ndo_fix_features = hsr_fix_features, }; +static struct device_type hsr_type = { + .name = "hsr", +}; void hsr_dev_setup(struct net_device *dev) { random_ether_addr(dev->dev_addr); ether_setup(dev); - dev->header_ops = &hsr_header_ops; - dev->netdev_ops = &hsr_device_ops; - dev->tx_queue_len = 0; + dev->header_ops = &hsr_header_ops; + dev->netdev_ops = &hsr_device_ops; + SET_NETDEV_DEVTYPE(dev, &hsr_type); + dev->tx_queue_len = 0; dev->destructor = hsr_dev_destroy; + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | + NETIF_F_HW_VLAN_CTAG_TX; + + dev->features = dev->hw_features; + + /* Prevent recursive tx locking */ + dev->features |= NETIF_F_LLTX; + /* VLAN on top of HSR needs testing and probably some work on + * hsr_header_create() etc. + */ + dev->features |= NETIF_F_VLAN_CHALLENGED; + /* Not sure about this. Taken from bridge code. netdev_features.h says + * it means "Does not change network namespaces". + */ + dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise. */ -bool is_hsr_master(struct net_device *dev) +inline bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } -static int check_slave_ok(struct net_device *dev) -{ - /* Don't allow HSR on non-ethernet like devices */ - if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) || - (dev->addr_len != ETH_ALEN)) { - netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n"); - return -EINVAL; - } - - /* Don't allow enslaving hsr devices */ - if (is_hsr_master(dev)) { - netdev_info(dev, "Cannot create trees of HSR devices.\n"); - return -EINVAL; - } - - if (is_hsr_slave(dev)) { - netdev_info(dev, "This device is already a HSR slave.\n"); - return -EINVAL; - } - - if (dev->priv_flags & IFF_802_1Q_VLAN) { - netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); - return -EINVAL; - } - - /* HSR over bonded devices has not been tested, but I'm not sure it - * won't work... - */ - - return 0; -} - - /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 @@ -500,97 +427,74 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec) { - struct hsr_priv *hsr_priv; - int i; + struct hsr_priv *hsr; + struct hsr_port *port; int res; - hsr_priv = netdev_priv(hsr_dev); - hsr_priv->dev = hsr_dev; - INIT_LIST_HEAD(&hsr_priv->node_db); - INIT_LIST_HEAD(&hsr_priv->self_node_db); - for (i = 0; i < HSR_MAX_SLAVE; i++) - hsr_priv->slave[i] = slave[i]; - - spin_lock_init(&hsr_priv->seqnr_lock); - /* Overflow soon to find bugs easier: */ - hsr_priv->sequence_nr = USHRT_MAX - 1024; - - init_timer(&hsr_priv->announce_timer); - hsr_priv->announce_timer.function = hsr_announce; - hsr_priv->announce_timer.data = (unsigned long) hsr_priv; + hsr = netdev_priv(hsr_dev); + INIT_LIST_HEAD(&hsr->ports); + INIT_LIST_HEAD(&hsr->node_db); + INIT_LIST_HEAD(&hsr->self_node_db); - ether_addr_copy(hsr_priv->sup_multicast_addr, def_multicast_addr); - hsr_priv->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; + ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); -/* FIXME: should I modify the value of these? - * - * - hsr_dev->flags - i.e. - * IFF_MASTER/SLAVE? - * - hsr_dev->priv_flags - i.e. - * IFF_EBRIDGE? - * IFF_TX_SKB_SHARING? - * IFF_HSR_MASTER/SLAVE? - */ + /* Make sure we recognize frames from ourselves in hsr_rcv() */ + res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr, + slave[1]->dev_addr); + if (res < 0) + return res; - for (i = 0; i < HSR_MAX_SLAVE; i++) { - res = check_slave_ok(slave[i]); - if (res) - return res; - } + spin_lock_init(&hsr->seqnr_lock); + /* Overflow soon to find bugs easier: */ + hsr->sequence_nr = HSR_SEQNR_START; - hsr_dev->features = slave[0]->features & slave[1]->features; - /* Prevent recursive tx locking */ - hsr_dev->features |= NETIF_F_LLTX; - /* VLAN on top of HSR needs testing and probably some work on - * hsr_header_create() etc. - */ - hsr_dev->features |= NETIF_F_VLAN_CHALLENGED; + init_timer(&hsr->announce_timer); + hsr->announce_timer.function = hsr_announce; + hsr->announce_timer.data = (unsigned long) hsr; - /* Set hsr_dev's MAC address to that of mac_slave1 */ - ether_addr_copy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr); + init_timer(&hsr->prune_timer); + hsr->prune_timer.function = hsr_prune_nodes; + hsr->prune_timer.data = (unsigned long) hsr; - /* Set required header length */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if (slave[i]->hard_header_len + HSR_TAGLEN > - hsr_dev->hard_header_len) - hsr_dev->hard_header_len = - slave[i]->hard_header_len + HSR_TAGLEN; - } + ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); + hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; - /* MTU */ - for (i = 0; i < HSR_MAX_SLAVE; i++) - if (slave[i]->mtu - HSR_TAGLEN < hsr_dev->mtu) - hsr_dev->mtu = slave[i]->mtu - HSR_TAGLEN; + /* FIXME: should I modify the value of these? + * + * - hsr_dev->flags - i.e. + * IFF_MASTER/SLAVE? + * - hsr_dev->priv_flags - i.e. + * IFF_EBRIDGE? + * IFF_TX_SKB_SHARING? + * IFF_HSR_MASTER/SLAVE? + */ /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); - /* Promiscuity */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - res = dev_set_promiscuity(slave[i], 1); - if (res) { - netdev_info(hsr_dev, "Cannot set slave promiscuity (%s, %d)\n", - slave[i]->name, res); - goto fail; - } - } + res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER); + if (res) + return res; - /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr_priv->self_node_db, - hsr_dev->dev_addr, - hsr_priv->slave[1]->dev_addr); - if (res < 0) + res = register_netdevice(hsr_dev); + if (res) goto fail; - res = register_netdevice(hsr_dev); + res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A); + if (res) + goto fail; + res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B); if (res) goto fail; - register_hsr_master(hsr_priv); + hsr->prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); + add_timer(&hsr->prune_timer); return 0; fail: - restore_slaves(hsr_dev); + hsr_for_each_port(hsr, port) + hsr_del_port(port); + return res; } diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 2c7148e73914..108a5d59d2a6 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #ifndef __HSR_DEVICE_H @@ -18,12 +18,8 @@ void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec); -void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2); -void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2); -void hsr_check_announce(struct net_device *hsr_dev, int old_operstate); +void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); bool is_hsr_master(struct net_device *dev); -int hsr_get_max_mtu(struct hsr_priv *hsr_priv); +int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c new file mode 100644 index 000000000000..7871ed6d3825 --- /dev/null +++ b/net/hsr/hsr_forward.c @@ -0,0 +1,368 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#include "hsr_forward.h" +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/etherdevice.h> +#include <linux/if_vlan.h> +#include "hsr_main.h" +#include "hsr_framereg.h" + + +struct hsr_node; + +struct hsr_frame_info { + struct sk_buff *skb_std; + struct sk_buff *skb_hsr; + struct hsr_port *port_rcv; + struct hsr_node *node_src; + u16 sequence_nr; + bool is_supervision; + bool is_vlan; + bool is_local_dest; + bool is_local_exclusive; +}; + + +/* The uses I can see for these HSR supervision frames are: + * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = + * 22") to reset any sequence_nr counters belonging to that node. Useful if + * the other node's counter has been reset for some reason. + * -- + * Or not - resetting the counter and bridging the frame would create a + * loop, unfortunately. + * + * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck + * frame is received from a particular node, we know something is wrong. + * We just register these (as with normal frames) and throw them away. + * + * 3) Allow different MAC addresses for the two slave interfaces, using the + * MacAddressA field. + */ +static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) +{ + struct hsr_ethhdr_sp *hdr; + + WARN_ON_ONCE(!skb_mac_header_was_set(skb)); + hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb); + + if (!ether_addr_equal(hdr->ethhdr.h_dest, + hsr->sup_multicast_addr)) + return false; + + if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f) + return false; + if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) && + (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) + return false; + if (hdr->hsr_sup.HSR_TLV_Length != 12) + return false; + + return true; +} + + +static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, + struct hsr_frame_info *frame) +{ + struct sk_buff *skb; + int copylen; + unsigned char *dst, *src; + + skb_pull(skb_in, HSR_HLEN); + skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); + skb_push(skb_in, HSR_HLEN); + if (skb == NULL) + return NULL; + + skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start -= HSR_HLEN; + + copylen = 2*ETH_ALEN; + if (frame->is_vlan) + copylen += VLAN_HLEN; + src = skb_mac_header(skb_in); + dst = skb_mac_header(skb); + memcpy(dst, src, copylen); + + skb->protocol = eth_hdr(skb)->h_proto; + return skb; +} + +static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (!frame->skb_std) + frame->skb_std = create_stripped_skb(frame->skb_hsr, frame); + return skb_clone(frame->skb_std, GFP_ATOMIC); +} + + +static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, + struct hsr_port *port) +{ + struct hsr_ethhdr *hsr_ethhdr; + int lane_id; + int lsdu_size; + + if (port->type == HSR_PT_SLAVE_A) + lane_id = 0; + else + lane_id = 1; + + lsdu_size = skb->len - 14; + if (frame->is_vlan) + lsdu_size -= 4; + + hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + + set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id); + set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); + hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); + hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; + hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP); +} + +static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, + struct hsr_frame_info *frame, + struct hsr_port *port) +{ + int movelen; + unsigned char *dst, *src; + struct sk_buff *skb; + + /* Create the new skb with enough headroom to fit the HSR tag */ + skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC); + if (skb == NULL) + return NULL; + skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += HSR_HLEN; + + movelen = ETH_HLEN; + if (frame->is_vlan) + movelen += VLAN_HLEN; + + src = skb_mac_header(skb); + dst = skb_push(skb, HSR_HLEN); + memmove(dst, src, movelen); + skb_reset_mac_header(skb); + + hsr_fill_tag(skb, frame, port); + + return skb; +} + +/* If the original frame was an HSR tagged frame, just clone it to be sent + * unchanged. Otherwise, create a private frame especially tagged for 'port'. + */ +static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (frame->skb_hsr) + return skb_clone(frame->skb_hsr, GFP_ATOMIC); + + if ((port->type != HSR_PT_SLAVE_A) && (port->type != HSR_PT_SLAVE_B)) { + WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port"); + return NULL; + } + + return create_tagged_skb(frame->skb_std, frame, port); +} + + +static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, + struct hsr_node *node_src) +{ + bool was_multicast_frame; + int res; + + was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); + hsr_addr_subst_source(node_src, skb); + skb_pull(skb, ETH_HLEN); + res = netif_rx(skb); + if (res == NET_RX_DROP) { + dev->stats.rx_dropped++; + } else { + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + if (was_multicast_frame) + dev->stats.multicast++; + } +} + +static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, + struct hsr_frame_info *frame) +{ + if (frame->port_rcv->type == HSR_PT_MASTER) { + hsr_addr_subst_dest(frame->node_src, skb, port); + + /* Address substitution (IEC62439-3 pp 26, 50): replace mac + * address of outgoing frame with that of the outgoing slave's. + */ + ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); + } + return dev_queue_xmit(skb); +} + + +/* Forward the frame through all devices except: + * - Back through the receiving device + * - If it's a HSR frame: through a device where it has passed before + * - To the local HSR master only if the frame is directly addressed to it, or + * a non-supervision multicast or broadcast frame. + * + * HSR slave devices should insert a HSR tag into the frame, or forward the + * frame unchanged if it's already tagged. Interlink devices should strip HSR + * tags if they're of the non-HSR type (but only after duplicate discard). The + * master device always strips HSR tags. + */ +static void hsr_forward_do(struct hsr_frame_info *frame) +{ + struct hsr_port *port; + struct sk_buff *skb; + + hsr_for_each_port(frame->port_rcv->hsr, port) { + /* Don't send frame back the way it came */ + if (port == frame->port_rcv) + continue; + + /* Don't deliver locally unless we should */ + if ((port->type == HSR_PT_MASTER) && !frame->is_local_dest) + continue; + + /* Deliver frames directly addressed to us to master only */ + if ((port->type != HSR_PT_MASTER) && frame->is_local_exclusive) + continue; + + /* Don't send frame over port where it has been sent before */ + if (hsr_register_frame_out(port, frame->node_src, + frame->sequence_nr)) + continue; + + if (frame->is_supervision && (port->type == HSR_PT_MASTER)) { + hsr_handle_sup_frame(frame->skb_hsr, + frame->node_src, + frame->port_rcv); + continue; + } + + if (port->type != HSR_PT_MASTER) + skb = frame_get_tagged_skb(frame, port); + else + skb = frame_get_stripped_skb(frame, port); + if (skb == NULL) { + /* FIXME: Record the dropped frame? */ + continue; + } + + skb->dev = port->dev; + if (port->type == HSR_PT_MASTER) + hsr_deliver_master(skb, port->dev, frame->node_src); + else + hsr_xmit(skb, port, frame); + } +} + + +static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + struct net_device *master_dev; + + master_dev = hsr_port_get_hsr(hsr, HSR_PT_MASTER)->dev; + + if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) { + frame->is_local_exclusive = true; + skb->pkt_type = PACKET_HOST; + } else { + frame->is_local_exclusive = false; + } + + if ((skb->pkt_type == PACKET_HOST) || + (skb->pkt_type == PACKET_MULTICAST) || + (skb->pkt_type == PACKET_BROADCAST)) { + frame->is_local_dest = true; + } else { + frame->is_local_dest = false; + } +} + + +static int hsr_fill_frame_info(struct hsr_frame_info *frame, + struct sk_buff *skb, struct hsr_port *port) +{ + struct ethhdr *ethhdr; + unsigned long irqflags; + + frame->is_supervision = is_supervision_frame(port->hsr, skb); + frame->node_src = hsr_get_node(&port->hsr->node_db, skb, + frame->is_supervision); + if (frame->node_src == NULL) + return -1; /* Unknown node and !is_supervision, or no mem */ + + ethhdr = (struct ethhdr *) skb_mac_header(skb); + frame->is_vlan = false; + if (ethhdr->h_proto == htons(ETH_P_8021Q)) { + frame->is_vlan = true; + /* FIXME: */ + WARN_ONCE(1, "HSR: VLAN not yet supported"); + } + if (ethhdr->h_proto == htons(ETH_P_PRP)) { + frame->skb_std = NULL; + frame->skb_hsr = skb; + frame->sequence_nr = hsr_get_skb_sequence_nr(skb); + } else { + frame->skb_std = skb; + frame->skb_hsr = NULL; + /* Sequence nr for the master node */ + spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags); + frame->sequence_nr = port->hsr->sequence_nr; + port->hsr->sequence_nr++; + spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags); + } + + frame->port_rcv = port; + check_local_dest(port->hsr, skb, frame); + + return 0; +} + +/* Must be called holding rcu read lock (because of the port parameter) */ +void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) +{ + struct hsr_frame_info frame; + + if (skb_mac_header(skb) != skb->data) { + WARN_ONCE(1, "%s:%d: Malformed frame (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + goto out_drop; + } + + if (hsr_fill_frame_info(&frame, skb, port) < 0) + goto out_drop; + hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); + hsr_forward_do(&frame); + + if (frame.skb_hsr != NULL) + kfree_skb(frame.skb_hsr); + if (frame.skb_std != NULL) + kfree_skb(frame.skb_std); + return; + +out_drop: + port->dev->stats.tx_dropped++; + kfree_skb(skb); +} diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h new file mode 100644 index 000000000000..5c5bc4b6b75f --- /dev/null +++ b/net/hsr/hsr_forward.h @@ -0,0 +1,20 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#ifndef __HSR_FORWARD_H +#define __HSR_FORWARD_H + +#include <linux/netdevice.h> +#include "hsr_main.h" + +void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port); + +#endif /* __HSR_FORWARD_H */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 83e58449366a..bace124d14ef 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR @@ -23,71 +23,68 @@ #include "hsr_netlink.h" -struct node_entry { - struct list_head mac_list; - unsigned char MacAddressA[ETH_ALEN]; - unsigned char MacAddressB[ETH_ALEN]; - enum hsr_dev_idx AddrB_if; /* The local slave through which AddrB - * frames are received from this node - */ - unsigned long time_in[HSR_MAX_SLAVE]; - bool time_in_stale[HSR_MAX_SLAVE]; - u16 seq_out[HSR_MAX_DEV]; - struct rcu_head rcu_head; +struct hsr_node { + struct list_head mac_list; + unsigned char MacAddressA[ETH_ALEN]; + unsigned char MacAddressB[ETH_ALEN]; + /* Local slave through which AddrB frames are received from this node */ + enum hsr_port_type AddrB_port; + unsigned long time_in[HSR_PT_PORTS]; + bool time_in_stale[HSR_PT_PORTS]; + u16 seq_out[HSR_PT_PORTS]; + struct rcu_head rcu_head; }; -/* TODO: use hash lists for mac addresses (linux/jhash.h)? */ +/* TODO: use hash lists for mac addresses (linux/jhash.h)? */ -/* Search for mac entry. Caller must hold rcu read lock. +/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, + * false otherwise. */ -static struct node_entry *find_node_by_AddrA(struct list_head *node_db, - const unsigned char addr[ETH_ALEN]) +static bool seq_nr_after(u16 a, u16 b) { - struct node_entry *node; - - list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, addr)) - return node; - } + /* Remove inconsistency where + * seq_nr_after(a, b) == seq_nr_before(a, b) + */ + if ((int) b - a == 32768) + return false; - return NULL; + return (((s16) (b - a)) < 0); } +#define seq_nr_before(a, b) seq_nr_after((b), (a)) +#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) +#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -/* Search for mac entry. Caller must hold rcu read lock. - */ -static struct node_entry *find_node_by_AddrB(struct list_head *node_db, - const unsigned char addr[ETH_ALEN]) +bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { - struct node_entry *node; + struct hsr_node *node; - list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressB, addr)) - return node; + node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node, + mac_list); + if (!node) { + WARN_ONCE(1, "HSR: No self node\n"); + return false; } - return NULL; -} + if (ether_addr_equal(addr, node->MacAddressA)) + return true; + if (ether_addr_equal(addr, node->MacAddressB)) + return true; + return false; +} /* Search for mac entry. Caller must hold rcu read lock. */ -struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb) +static struct hsr_node *find_node_by_AddrA(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]) { - struct node_entry *node; - struct ethhdr *ethhdr; - - if (!skb_mac_header_was_set(skb)) - return NULL; - - ethhdr = (struct ethhdr *) skb_mac_header(skb); + struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, ethhdr->h_source)) - return node; - if (ether_addr_equal(node->MacAddressB, ethhdr->h_source)) + if (ether_addr_equal(node->MacAddressA, addr)) return node; } @@ -102,7 +99,7 @@ int hsr_create_self_node(struct list_head *self_node_db, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]) { - struct node_entry *node, *oldnode; + struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) @@ -113,7 +110,7 @@ int hsr_create_self_node(struct list_head *self_node_db, rcu_read_lock(); oldnode = list_first_or_null_rcu(self_node_db, - struct node_entry, mac_list); + struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); rcu_read_unlock(); @@ -128,135 +125,144 @@ int hsr_create_self_node(struct list_head *self_node_db, } -/* Add/merge node to the database of nodes. 'skb' must contain an HSR - * supervision frame. - * - If the supervision header's MacAddressA field is not yet in the database, - * this frame is from an hitherto unknown node - add it to the database. - * - If the sender's MAC address is not the same as its MacAddressA address, - * the node is using PICS_SUBS (address substitution). Record the sender's - * address as the node's MacAddressB. - * - * This function needs to work even if the sender node has changed one of its - * slaves' MAC addresses. In this case, there are four different cases described - * by (Addr-changed, received-from) pairs as follows. Note that changing the - * SlaveA address is equal to changing the node's own address: - * - * - (AddrB, SlaveB): The new AddrB will be recorded by PICS_SUBS code since - * node == NULL. - * - (AddrB, SlaveA): Will work as usual (the AddrB change won't be detected - * from this frame). - * - * - (AddrA, SlaveB): The old node will be found. We need to detect this and - * remove the node. - * - (AddrA, SlaveA): A new node will be registered (non-PICS_SUBS at first). - * The old one will be pruned after HSR_NODE_FORGET_TIME. - * - * We also need to detect if the sender's SlaveA and SlaveB cables have been - * swapped. +/* Allocate an hsr_node and add it to node_db. 'addr' is the node's AddressA; + * seq_out is used to initialize filtering of outgoing duplicate frames + * originating from the newly added node. */ -struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, - struct node_entry *node, - struct sk_buff *skb, - enum hsr_dev_idx dev_idx) +struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], + u16 seq_out) { - struct hsr_sup_payload *hsr_sp; - struct hsr_ethhdr_sp *hsr_ethsup; - int i; + struct hsr_node *node; unsigned long now; - - hsr_ethsup = (struct hsr_ethhdr_sp *) skb_mac_header(skb); - hsr_sp = (struct hsr_sup_payload *) skb->data; - - if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) { - /* Node has changed its AddrA, frame was received from SlaveB */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - node = NULL; - } - - if (node && (dev_idx == node->AddrB_if) && - !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) { - /* Cables have been swapped */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - node = NULL; - } - - if (node && (dev_idx != node->AddrB_if) && - (node->AddrB_if != HSR_DEV_NONE) && - !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) { - /* Cables have been swapped */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - node = NULL; - } - - if (node) - return node; - - node = find_node_by_AddrA(&hsr_priv->node_db, hsr_sp->MacAddressA); - if (node) { - /* Node is known, but frame was received from an unknown - * address. Node is PICS_SUBS capable; merge its AddrB. - */ - ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); - node->AddrB_if = dev_idx; - return node; - } + int i; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (!node) return NULL; - ether_addr_copy(node->MacAddressA, hsr_sp->MacAddressA); - ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); - if (!ether_addr_equal(hsr_sp->MacAddressA, hsr_ethsup->ethhdr.h_source)) - node->AddrB_if = dev_idx; - else - node->AddrB_if = HSR_DEV_NONE; + ether_addr_copy(node->MacAddressA, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; - for (i = 0; i < HSR_MAX_SLAVE; i++) + for (i = 0; i < HSR_PT_PORTS; i++) node->time_in[i] = now; - for (i = 0; i < HSR_MAX_DEV; i++) - node->seq_out[i] = ntohs(hsr_ethsup->hsr_sup.sequence_nr) - 1; + for (i = 0; i < HSR_PT_PORTS; i++) + node->seq_out[i] = seq_out; - list_add_tail_rcu(&node->mac_list, &hsr_priv->node_db); + list_add_tail_rcu(&node->mac_list, node_db); return node; } +/* Get the hsr_node from which 'skb' was sent. + */ +struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, + bool is_sup) +{ + struct hsr_node *node; + struct ethhdr *ethhdr; + u16 seq_out; + + if (!skb_mac_header_was_set(skb)) + return NULL; + + ethhdr = (struct ethhdr *) skb_mac_header(skb); + + list_for_each_entry_rcu(node, node_db, mac_list) { + if (ether_addr_equal(node->MacAddressA, ethhdr->h_source)) + return node; + if (ether_addr_equal(node->MacAddressB, ethhdr->h_source)) + return node; + } + + if (!is_sup) + return NULL; /* Only supervision frame may create node entry */ + + if (ethhdr->h_proto == htons(ETH_P_PRP)) { + /* Use the existing sequence_nr from the tag as starting point + * for filtering duplicate frames. + */ + seq_out = hsr_get_skb_sequence_nr(skb) - 1; + } else { + WARN_ONCE(1, "%s: Non-HSR frame\n", __func__); + seq_out = 0; + } + + return hsr_add_node(node_db, ethhdr->h_source, seq_out); +} + +/* Use the Supervision frame's info about an eventual MacAddressB for merging + * nodes that has previously had their MacAddressB registered as a separate + * node. + */ +void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, + struct hsr_port *port_rcv) +{ + struct hsr_node *node_real; + struct hsr_sup_payload *hsr_sp; + struct list_head *node_db; + int i; + + skb_pull(skb, sizeof(struct hsr_ethhdr_sp)); + hsr_sp = (struct hsr_sup_payload *) skb->data; + + if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA)) + /* Not sent from MacAddressB of a PICS_SUBS capable node */ + goto done; + + /* Merge node_curr (registered on MacAddressB) into node_real */ + node_db = &port_rcv->hsr->node_db; + node_real = find_node_by_AddrA(node_db, hsr_sp->MacAddressA); + if (!node_real) + /* No frame received from AddrA of this node yet */ + node_real = hsr_add_node(node_db, hsr_sp->MacAddressA, + HSR_SEQNR_START - 1); + if (!node_real) + goto done; /* No mem */ + if (node_real == node_curr) + /* Node has already been merged */ + goto done; + + ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source); + for (i = 0; i < HSR_PT_PORTS; i++) { + if (!node_curr->time_in_stale[i] && + time_after(node_curr->time_in[i], node_real->time_in[i])) { + node_real->time_in[i] = node_curr->time_in[i]; + node_real->time_in_stale[i] = node_curr->time_in_stale[i]; + } + if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) + node_real->seq_out[i] = node_curr->seq_out[i]; + } + node_real->AddrB_port = port_rcv->type; + + list_del_rcu(&node_curr->mac_list); + kfree_rcu(node_curr, rcu_head); + +done: + skb_push(skb, sizeof(struct hsr_ethhdr_sp)); +} + /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * - * If the frame was sent by a node's B interface, replace the sender + * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (MacAddressA) so that upper * layers recognize where it came from. */ -void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb) +void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { - struct ethhdr *ethhdr; - struct node_entry *node; - if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } - ethhdr = (struct ethhdr *) skb_mac_header(skb); - rcu_read_lock(); - node = find_node_by_AddrB(&hsr_priv->node_db, ethhdr->h_source); - if (node) - ether_addr_copy(ethhdr->h_source, node->MacAddressA); - rcu_read_unlock(); + memcpy(ð_hdr(skb)->h_source, node->MacAddressA, ETH_ALEN); } - /* 'skb' is a frame meant for another host. - * 'hsr_dev_idx' is the HSR index of the outgoing device + * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the @@ -264,47 +270,44 @@ void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb) * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ -void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr, - enum hsr_dev_idx dev_idx) +void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, + struct hsr_port *port) { - struct node_entry *node; + struct hsr_node *node_dst; - rcu_read_lock(); - node = find_node_by_AddrA(&hsr_priv->node_db, ethhdr->h_dest); - if (node && (node->AddrB_if == dev_idx)) - ether_addr_copy(ethhdr->h_dest, node->MacAddressB); - rcu_read_unlock(); -} + if (!skb_mac_header_was_set(skb)) { + WARN_ONCE(1, "%s: Mac header not set\n", __func__); + return; + } + if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) + return; -/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, - * false otherwise. - */ -static bool seq_nr_after(u16 a, u16 b) -{ - /* Remove inconsistency where - * seq_nr_after(a, b) == seq_nr_before(a, b) - */ - if ((int) b - a == 32768) - return false; + node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest); + if (!node_dst) { + WARN_ONCE(1, "%s: Unknown node\n", __func__); + return; + } + if (port->type != node_dst->AddrB_port) + return; - return (((s16) (b - a)) < 0); + ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->MacAddressB); } -#define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) -#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx) +void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, + u16 sequence_nr) { - if ((dev_idx < 0) || (dev_idx >= HSR_MAX_SLAVE)) { - WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx); + /* Don't register incoming frames without a valid sequence number. This + * ensures entries of restarted nodes gets pruned so that they can + * re-register and resume communications. + */ + if (seq_nr_before(sequence_nr, node->seq_out[port->type])) return; - } - node->time_in[dev_idx] = jiffies; - node->time_in_stale[dev_idx] = false; -} + node->time_in[port->type] = jiffies; + node->time_in_stale[port->type] = false; +} /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. @@ -314,102 +317,87 @@ void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx) * 0 otherwise, or * negative error code on error */ -int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx, - struct sk_buff *skb) +int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, + u16 sequence_nr) { - struct hsr_ethhdr *hsr_ethhdr; - u16 sequence_nr; - - if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) { - WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx); - return -EINVAL; - } - if (!skb_mac_header_was_set(skb)) { - WARN_ONCE(1, "%s: Mac header not set\n", __func__); - return -EINVAL; - } - hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); - - sequence_nr = ntohs(hsr_ethhdr->hsr_tag.sequence_nr); - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[dev_idx])) + if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])) return 1; - node->seq_out[dev_idx] = sequence_nr; + node->seq_out[port->type] = sequence_nr; return 0; } - -static bool is_late(struct node_entry *node, enum hsr_dev_idx dev_idx) +static struct hsr_port *get_late_port(struct hsr_priv *hsr, + struct hsr_node *node) { - enum hsr_dev_idx other; - - if (node->time_in_stale[dev_idx]) - return true; - - if (dev_idx == HSR_DEV_SLAVE_A) - other = HSR_DEV_SLAVE_B; - else - other = HSR_DEV_SLAVE_A; - - if (node->time_in_stale[other]) - return false; + if (node->time_in_stale[HSR_PT_SLAVE_A]) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (node->time_in_stale[HSR_PT_SLAVE_B]) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + + if (time_after(node->time_in[HSR_PT_SLAVE_B], + node->time_in[HSR_PT_SLAVE_A] + + msecs_to_jiffies(MAX_SLAVE_DIFF))) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (time_after(node->time_in[HSR_PT_SLAVE_A], + node->time_in[HSR_PT_SLAVE_B] + + msecs_to_jiffies(MAX_SLAVE_DIFF))) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); - if (time_after(node->time_in[other], node->time_in[dev_idx] + - msecs_to_jiffies(MAX_SLAVE_DIFF))) - return true; - - return false; + return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ -void hsr_prune_nodes(struct hsr_priv *hsr_priv) +void hsr_prune_nodes(unsigned long data) { - struct node_entry *node; + struct hsr_priv *hsr; + struct hsr_node *node; + struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; + hsr = (struct hsr_priv *) data; + rcu_read_lock(); - list_for_each_entry_rcu(node, &hsr_priv->node_db, mac_list) { + list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { /* Shorthand */ - time_a = node->time_in[HSR_DEV_SLAVE_A]; - time_b = node->time_in[HSR_DEV_SLAVE_B]; + time_a = node->time_in[HSR_PT_SLAVE_A]; + time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2)) - node->time_in_stale[HSR_DEV_SLAVE_A] = true; + node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2)) - node->time_in_stale[HSR_DEV_SLAVE_B] = true; + node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. * At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; - if (node->time_in_stale[HSR_DEV_SLAVE_A] || - (!node->time_in_stale[HSR_DEV_SLAVE_B] && + if (node->time_in_stale[HSR_PT_SLAVE_A] || + (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) { - - if (is_late(node, HSR_DEV_SLAVE_A)) - hsr_nl_ringerror(hsr_priv, node->MacAddressA, - HSR_DEV_SLAVE_A); - else if (is_late(node, HSR_DEV_SLAVE_B)) - hsr_nl_ringerror(hsr_priv, node->MacAddressA, - HSR_DEV_SLAVE_B); + rcu_read_lock(); + port = get_late_port(hsr, node); + if (port != NULL) + hsr_nl_ringerror(hsr, node->MacAddressA, port); + rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { - hsr_nl_nodedown(hsr_priv, node->MacAddressA); + hsr_nl_nodedown(hsr, node->MacAddressA); list_del_rcu(&node->mac_list); /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); @@ -419,21 +407,21 @@ void hsr_prune_nodes(struct hsr_priv *hsr_priv) } -void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, +void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { - struct node_entry *node; + struct hsr_node *node; if (!_pos) { - node = list_first_or_null_rcu(&hsr_priv->node_db, - struct node_entry, mac_list); + node = list_first_or_null_rcu(&hsr->node_db, + struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->MacAddressA); return node; } node = _pos; - list_for_each_entry_continue_rcu(node, &hsr_priv->node_db, mac_list) { + list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->MacAddressA); return node; } @@ -442,7 +430,7 @@ void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, } -int hsr_get_node_data(struct hsr_priv *hsr_priv, +int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, @@ -451,12 +439,13 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, int *if2_age, u16 *if2_seq) { - struct node_entry *node; + struct hsr_node *node; + struct hsr_port *port; unsigned long tdiff; rcu_read_lock(); - node = find_node_by_AddrA(&hsr_priv->node_db, addr); + node = find_node_by_AddrA(&hsr->node_db, addr); if (!node) { rcu_read_unlock(); return -ENOENT; /* No such entry */ @@ -464,8 +453,8 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, ether_addr_copy(addr_b, node->MacAddressB); - tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_A]; - if (node->time_in_stale[HSR_DEV_SLAVE_A]) + tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; + if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) @@ -474,8 +463,8 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, else *if1_age = jiffies_to_msecs(tdiff); - tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_B]; - if (node->time_in_stale[HSR_DEV_SLAVE_B]) + tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; + if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) @@ -485,13 +474,15 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ - *if1_seq = node->seq_out[HSR_DEV_SLAVE_B]; - *if2_seq = node->seq_out[HSR_DEV_SLAVE_A]; + *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; + *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; - if ((node->AddrB_if != HSR_DEV_NONE) && hsr_priv->slave[node->AddrB_if]) - *addr_b_ifindex = hsr_priv->slave[node->AddrB_if]->ifindex; - else + if (node->AddrB_port != HSR_PT_NONE) { + port = hsr_port_get_hsr(hsr, node->AddrB_port); + *addr_b_ifindex = port->dev->ifindex; + } else { *addr_b_ifindex = -1; + } rcu_read_unlock(); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index e6c4022030ad..438b40f98f5a 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,42 +6,43 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ -#ifndef _HSR_FRAMEREG_H -#define _HSR_FRAMEREG_H +#ifndef __HSR_FRAMEREG_H +#define __HSR_FRAMEREG_H #include "hsr_main.h" -struct node_entry; +struct hsr_node; -struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb); +struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], + u16 seq_out); +struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, + bool is_sup); +void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, + struct hsr_port *port); +bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr); -struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, - struct node_entry *node, - struct sk_buff *skb, - enum hsr_dev_idx dev_idx); +void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb); +void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, + struct hsr_port *port); -void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb); -void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr, - enum hsr_dev_idx dev_idx); +void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, + u16 sequence_nr); +int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, + u16 sequence_nr); -void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx); - -int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx, - struct sk_buff *skb); - -void hsr_prune_nodes(struct hsr_priv *hsr_priv); +void hsr_prune_nodes(unsigned long data); int hsr_create_self_node(struct list_head *self_node_db, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]); -void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, +void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]); -int hsr_get_node_data(struct hsr_priv *hsr_priv, +int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, @@ -50,4 +51,4 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, int *if2_age, u16 *if2_seq); -#endif /* _HSR_FRAMEREG_H */ +#endif /* __HSR_FRAMEREG_H */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 3fee5218a691..779d28b65417 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,11 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com - * - * In addition to routines for registering and unregistering HSR support, this - * file also contains the receive routine that handles all incoming frames with - * Ethertype (protocol) ETH_P_PRP (HSRv0), and network device event handling. + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #include <linux/netdevice.h> @@ -21,154 +17,71 @@ #include "hsr_device.h" #include "hsr_netlink.h" #include "hsr_framereg.h" - - -/* List of all registered virtual HSR devices */ -static LIST_HEAD(hsr_list); - -void register_hsr_master(struct hsr_priv *hsr_priv) -{ - list_add_tail_rcu(&hsr_priv->hsr_list, &hsr_list); -} - -void unregister_hsr_master(struct hsr_priv *hsr_priv) -{ - struct hsr_priv *hsr_priv_it; - - list_for_each_entry(hsr_priv_it, &hsr_list, hsr_list) - if (hsr_priv_it == hsr_priv) { - list_del_rcu(&hsr_priv_it->hsr_list); - return; - } -} - -bool is_hsr_slave(struct net_device *dev) -{ - struct hsr_priv *hsr_priv_it; - - list_for_each_entry_rcu(hsr_priv_it, &hsr_list, hsr_list) { - if (dev == hsr_priv_it->slave[0]) - return true; - if (dev == hsr_priv_it->slave[1]) - return true; - } - - return false; -} - - -/* If dev is a HSR slave device, return the virtual master device. Return NULL - * otherwise. - */ -static struct hsr_priv *get_hsr_master(struct net_device *dev) -{ - struct hsr_priv *hsr_priv; - - rcu_read_lock(); - list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list) - if ((dev == hsr_priv->slave[0]) || - (dev == hsr_priv->slave[1])) { - rcu_read_unlock(); - return hsr_priv; - } - - rcu_read_unlock(); - return NULL; -} - - -/* If dev is a HSR slave device, return the other slave device. Return NULL - * otherwise. - */ -static struct net_device *get_other_slave(struct hsr_priv *hsr_priv, - struct net_device *dev) -{ - if (dev == hsr_priv->slave[0]) - return hsr_priv->slave[1]; - if (dev == hsr_priv->slave[1]) - return hsr_priv->slave[0]; - - return NULL; -} +#include "hsr_slave.h" static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *slave, *other_slave; - struct hsr_priv *hsr_priv; - int old_operstate; + struct net_device *dev; + struct hsr_port *port, *master; + struct hsr_priv *hsr; int mtu_max; int res; - struct net_device *dev; dev = netdev_notifier_info_to_dev(ptr); - - hsr_priv = get_hsr_master(dev); - if (hsr_priv) { - /* dev is a slave device */ - slave = dev; - other_slave = get_other_slave(hsr_priv, slave); - } else { + port = hsr_port_get_rtnl(dev); + if (port == NULL) { if (!is_hsr_master(dev)) - return NOTIFY_DONE; - hsr_priv = netdev_priv(dev); - slave = hsr_priv->slave[0]; - other_slave = hsr_priv->slave[1]; + return NOTIFY_DONE; /* Not an HSR device */ + hsr = netdev_priv(dev); + port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + } else { + hsr = port->hsr; } switch (event) { case NETDEV_UP: /* Administrative state DOWN */ case NETDEV_DOWN: /* Administrative state UP */ case NETDEV_CHANGE: /* Link (carrier) state changes */ - old_operstate = hsr_priv->dev->operstate; - hsr_set_carrier(hsr_priv->dev, slave, other_slave); - /* netif_stacked_transfer_operstate() cannot be used here since - * it doesn't set IF_OPER_LOWERLAYERDOWN (?) - */ - hsr_set_operstate(hsr_priv->dev, slave, other_slave); - hsr_check_announce(hsr_priv->dev, old_operstate); + hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGEADDR: - - /* This should not happen since there's no ndo_set_mac_address() - * for HSR devices - i.e. not supported. - */ - if (dev == hsr_priv->dev) + if (port->type == HSR_PT_MASTER) { + /* This should not happen since there's no + * ndo_set_mac_address() for HSR devices - i.e. not + * supported. + */ break; + } - if (dev == hsr_priv->slave[0]) - ether_addr_copy(hsr_priv->dev->dev_addr, - hsr_priv->slave[0]->dev_addr); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + + if (port->type == HSR_PT_SLAVE_A) { + ether_addr_copy(master->dev->dev_addr, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + } /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr_priv->self_node_db, - hsr_priv->dev->dev_addr, - hsr_priv->slave[1] ? - hsr_priv->slave[1]->dev_addr : - hsr_priv->dev->dev_addr); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + res = hsr_create_self_node(&hsr->self_node_db, + master->dev->dev_addr, + port ? + port->dev->dev_addr : + master->dev->dev_addr); if (res) - netdev_warn(hsr_priv->dev, + netdev_warn(master->dev, "Could not update HSR node address.\n"); - - if (dev == hsr_priv->slave[0]) - call_netdevice_notifiers(NETDEV_CHANGEADDR, hsr_priv->dev); break; case NETDEV_CHANGEMTU: - if (dev == hsr_priv->dev) + if (port->type == HSR_PT_MASTER) break; /* Handled in ndo_change_mtu() */ - mtu_max = hsr_get_max_mtu(hsr_priv); - if (hsr_priv->dev->mtu > mtu_max) - dev_set_mtu(hsr_priv->dev, mtu_max); + mtu_max = hsr_get_max_mtu(port->hsr); + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); + master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (dev == hsr_priv->slave[0]) - hsr_priv->slave[0] = NULL; - if (dev == hsr_priv->slave[1]) - hsr_priv->slave[1] = NULL; - - /* There should really be a way to set a new slave device... */ - + hsr_del_port(port); break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change @@ -181,255 +94,16 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, } -static struct timer_list prune_timer; - -static void prune_nodes_all(unsigned long data) -{ - struct hsr_priv *hsr_priv; - - rcu_read_lock(); - list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list) - hsr_prune_nodes(hsr_priv); - rcu_read_unlock(); - - prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); - add_timer(&prune_timer); -} - - -static struct sk_buff *hsr_pull_tag(struct sk_buff *skb) +struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { - struct hsr_tag *hsr_tag; - struct sk_buff *skb2; - - skb2 = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb2)) - goto err_free; - skb = skb2; - - if (unlikely(!pskb_may_pull(skb, HSR_TAGLEN))) - goto err_free; + struct hsr_port *port; - hsr_tag = (struct hsr_tag *) skb->data; - skb->protocol = hsr_tag->encap_proto; - skb_pull(skb, HSR_TAGLEN); - - return skb; - -err_free: - kfree_skb(skb); + hsr_for_each_port(hsr, port) + if (port->type == pt) + return port; return NULL; } - -/* The uses I can see for these HSR supervision frames are: - * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = - * 22") to reset any sequence_nr counters belonging to that node. Useful if - * the other node's counter has been reset for some reason. - * -- - * Or not - resetting the counter and bridging the frame would create a - * loop, unfortunately. - * - * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck - * frame is received from a particular node, we know something is wrong. - * We just register these (as with normal frames) and throw them away. - * - * 3) Allow different MAC addresses for the two slave interfaces, using the - * MacAddressA field. - */ -static bool is_supervision_frame(struct hsr_priv *hsr_priv, struct sk_buff *skb) -{ - struct hsr_sup_tag *hsr_stag; - - if (!ether_addr_equal(eth_hdr(skb)->h_dest, - hsr_priv->sup_multicast_addr)) - return false; - - hsr_stag = (struct hsr_sup_tag *) skb->data; - if (get_hsr_stag_path(hsr_stag) != 0x0f) - return false; - if ((hsr_stag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) && - (hsr_stag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) - return false; - if (hsr_stag->HSR_TLV_Length != 12) - return false; - - return true; -} - - -/* Implementation somewhat according to IEC-62439-3, p. 43 - */ -static int hsr_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct hsr_priv *hsr_priv; - struct net_device *other_slave; - struct node_entry *node; - bool deliver_to_self; - struct sk_buff *skb_deliver; - enum hsr_dev_idx dev_in_idx, dev_other_idx; - bool dup_out; - int ret; - - hsr_priv = get_hsr_master(dev); - - if (!hsr_priv) { - /* Non-HSR-slave device 'dev' is connected to a HSR network */ - kfree_skb(skb); - dev->stats.rx_errors++; - return NET_RX_SUCCESS; - } - - if (dev == hsr_priv->slave[0]) { - dev_in_idx = HSR_DEV_SLAVE_A; - dev_other_idx = HSR_DEV_SLAVE_B; - } else { - dev_in_idx = HSR_DEV_SLAVE_B; - dev_other_idx = HSR_DEV_SLAVE_A; - } - - node = hsr_find_node(&hsr_priv->self_node_db, skb); - if (node) { - /* Always kill frames sent by ourselves */ - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* Is this frame a candidate for local reception? */ - deliver_to_self = false; - if ((skb->pkt_type == PACKET_HOST) || - (skb->pkt_type == PACKET_MULTICAST) || - (skb->pkt_type == PACKET_BROADCAST)) - deliver_to_self = true; - else if (ether_addr_equal(eth_hdr(skb)->h_dest, - hsr_priv->dev->dev_addr)) { - skb->pkt_type = PACKET_HOST; - deliver_to_self = true; - } - - - rcu_read_lock(); /* node_db */ - node = hsr_find_node(&hsr_priv->node_db, skb); - - if (is_supervision_frame(hsr_priv, skb)) { - skb_pull(skb, sizeof(struct hsr_sup_tag)); - node = hsr_merge_node(hsr_priv, node, skb, dev_in_idx); - if (!node) { - rcu_read_unlock(); /* node_db */ - kfree_skb(skb); - hsr_priv->dev->stats.rx_dropped++; - return NET_RX_DROP; - } - skb_push(skb, sizeof(struct hsr_sup_tag)); - deliver_to_self = false; - } - - if (!node) { - /* Source node unknown; this might be a HSR frame from - * another net (different multicast address). Ignore it. - */ - rcu_read_unlock(); /* node_db */ - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* Register ALL incoming frames as outgoing through the other interface. - * This allows us to register frames as incoming only if they are valid - * for the receiving interface, without using a specific counter for - * incoming frames. - */ - dup_out = hsr_register_frame_out(node, dev_other_idx, skb); - if (!dup_out) - hsr_register_frame_in(node, dev_in_idx); - - /* Forward this frame? */ - if (!dup_out && (skb->pkt_type != PACKET_HOST)) - other_slave = get_other_slave(hsr_priv, dev); - else - other_slave = NULL; - - if (hsr_register_frame_out(node, HSR_DEV_MASTER, skb)) - deliver_to_self = false; - - rcu_read_unlock(); /* node_db */ - - if (!deliver_to_self && !other_slave) { - kfree_skb(skb); - /* Circulated frame; silently remove it. */ - return NET_RX_SUCCESS; - } - - skb_deliver = skb; - if (deliver_to_self && other_slave) { - /* skb_clone() is not enough since we will strip the hsr tag - * and do address substitution below - */ - skb_deliver = pskb_copy(skb, GFP_ATOMIC); - if (!skb_deliver) { - deliver_to_self = false; - hsr_priv->dev->stats.rx_dropped++; - } - } - - if (deliver_to_self) { - bool multicast_frame; - - skb_deliver = hsr_pull_tag(skb_deliver); - if (!skb_deliver) { - hsr_priv->dev->stats.rx_dropped++; - goto forward; - } -#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - /* Move everything in the header that is after the HSR tag, - * to work around alignment problems caused by the 6-byte HSR - * tag. In practice, this removes/overwrites the HSR tag in - * the header and restores a "standard" packet. - */ - memmove(skb_deliver->data - HSR_TAGLEN, skb_deliver->data, - skb_headlen(skb_deliver)); - - /* Adjust skb members so they correspond with the move above. - * This cannot possibly underflow skb->data since hsr_pull_tag() - * above succeeded. - * At this point in the protocol stack, the transport and - * network headers have not been set yet, and we haven't touched - * the mac header nor the head. So we only need to adjust data - * and tail: - */ - skb_deliver->data -= HSR_TAGLEN; - skb_deliver->tail -= HSR_TAGLEN; -#endif - skb_deliver->dev = hsr_priv->dev; - hsr_addr_subst_source(hsr_priv, skb_deliver); - multicast_frame = (skb_deliver->pkt_type == PACKET_MULTICAST); - ret = netif_rx(skb_deliver); - if (ret == NET_RX_DROP) { - hsr_priv->dev->stats.rx_dropped++; - } else { - hsr_priv->dev->stats.rx_packets++; - hsr_priv->dev->stats.rx_bytes += skb->len; - if (multicast_frame) - hsr_priv->dev->stats.multicast++; - } - } - -forward: - if (other_slave) { - skb_push(skb, ETH_HLEN); - skb->dev = other_slave; - dev_queue_xmit(skb); - } - - return NET_RX_SUCCESS; -} - - -static struct packet_type hsr_pt __read_mostly = { - .type = htons(ETH_P_PRP), - .func = hsr_rcv, -}; - static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; @@ -439,18 +113,9 @@ static int __init hsr_init(void) { int res; - BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_TAGLEN); - - dev_add_pack(&hsr_pt); - - init_timer(&prune_timer); - prune_timer.function = prune_nodes_all; - prune_timer.data = 0; - prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); - add_timer(&prune_timer); + BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); register_netdevice_notifier(&hsr_nb); - res = hsr_netlink_init(); return res; @@ -459,9 +124,7 @@ static int __init hsr_init(void) static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); - del_timer_sync(&prune_timer); hsr_netlink_exit(); - dev_remove_pack(&hsr_pt); } module_init(hsr_init); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 56fe060c0ab1..5a9c69962ded 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,11 +6,11 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ -#ifndef _HSR_PRIVATE_H -#define _HSR_PRIVATE_H +#ifndef __HSR_PRIVATE_H +#define __HSR_PRIVATE_H #include <linux/netdevice.h> #include <linux/list.h> @@ -29,6 +29,7 @@ * each node differ before we notify of communication problem? */ #define MAX_SLAVE_DIFF 3000 /* ms */ +#define HSR_SEQNR_START (USHRT_MAX - 1024) /* How often shall we check for broken ring and remove node entries older than @@ -46,16 +47,16 @@ * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, * encapsulated protocol } instead. + * + * Field names as defined in the IEC:2010 standard for HSR. */ -#define HSR_TAGLEN 6 - -/* Field names below as defined in the IEC:2010 standard for HSR. */ struct hsr_tag { __be16 path_and_LSDU_size; __be16 sequence_nr; __be16 encap_proto; } __packed; +#define HSR_HLEN 6 /* The helper functions below assumes that 'path' occupies the 4 most * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or @@ -136,31 +137,47 @@ struct hsr_ethhdr_sp { } __packed; -enum hsr_dev_idx { - HSR_DEV_NONE = -1, - HSR_DEV_SLAVE_A = 0, - HSR_DEV_SLAVE_B, - HSR_DEV_MASTER, +enum hsr_port_type { + HSR_PT_NONE = 0, /* Must be 0, used by framereg */ + HSR_PT_SLAVE_A, + HSR_PT_SLAVE_B, + HSR_PT_INTERLINK, + HSR_PT_MASTER, + HSR_PT_PORTS, /* This must be the last item in the enum */ +}; + +struct hsr_port { + struct list_head port_list; + struct net_device *dev; + struct hsr_priv *hsr; + enum hsr_port_type type; }; -#define HSR_MAX_SLAVE (HSR_DEV_SLAVE_B + 1) -#define HSR_MAX_DEV (HSR_DEV_MASTER + 1) struct hsr_priv { - struct list_head hsr_list; /* List of hsr devices */ struct rcu_head rcu_head; - struct net_device *dev; - struct net_device *slave[HSR_MAX_SLAVE]; - struct list_head node_db; /* Other HSR nodes */ + struct list_head ports; + struct list_head node_db; /* Known HSR nodes */ struct list_head self_node_db; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ + struct timer_list prune_timer; int announce_count; u16 sequence_nr; spinlock_t seqnr_lock; /* locking for sequence_nr */ unsigned char sup_multicast_addr[ETH_ALEN]; }; -void register_hsr_master(struct hsr_priv *hsr_priv); -void unregister_hsr_master(struct hsr_priv *hsr_priv); -bool is_hsr_slave(struct net_device *dev); +#define hsr_for_each_port(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list) + +struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); + +/* Caller must ensure skb is a valid HSR frame */ +static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) +{ + struct hsr_ethhdr *hsr_ethhdr; + + hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); +} -#endif /* _HSR_PRIVATE_H */ +#endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 01a5261ac7a5..a2c7e4c0ac1e 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Routines for handling Netlink messages for HSR. */ @@ -37,13 +37,17 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct net_device *link[2]; unsigned char multicast_spec; + if (!data) { + netdev_info(dev, "HSR: No slave devices specified\n"); + return -EINVAL; + } if (!data[IFLA_HSR_SLAVE1]) { - netdev_info(dev, "IFLA_HSR_SLAVE1 missing!\n"); + netdev_info(dev, "HSR: Slave1 device not specified\n"); return -EINVAL; } link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!data[IFLA_HSR_SLAVE2]) { - netdev_info(dev, "IFLA_HSR_SLAVE2 missing!\n"); + netdev_info(dev, "HSR: Slave2 device not specified\n"); return -EINVAL; } link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); @@ -63,21 +67,33 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; + int res; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); - if (hsr_priv->slave[0]) - if (nla_put_u32(skb, IFLA_HSR_SLAVE1, hsr_priv->slave[0]->ifindex)) - goto nla_put_failure; + res = 0; - if (hsr_priv->slave[1]) - if (nla_put_u32(skb, IFLA_HSR_SLAVE2, hsr_priv->slave[1]->ifindex)) - goto nla_put_failure; + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (port) + res = nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex); + rcu_read_unlock(); + if (res) + goto nla_put_failure; + + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) + res = nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex); + rcu_read_unlock(); + if (res) + goto nla_put_failure; if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN, - hsr_priv->sup_multicast_addr) || - nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr_priv->sequence_nr)) + hsr->sup_multicast_addr) || + nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; return 0; @@ -128,13 +144,13 @@ static const struct genl_multicast_group hsr_mcgrps[] = { * over one of the slave interfaces. This would indicate an open network ring * (i.e. a link has failed somewhere). */ -void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], - enum hsr_dev_idx dev_idx) +void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], + struct hsr_port *port) { struct sk_buff *skb; void *msg_head; + struct hsr_port *master; int res; - int ifindex; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb) @@ -148,11 +164,7 @@ void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[dev_idx]) - ifindex = hsr_priv->slave[dev_idx]->ifindex; - else - ifindex = -1; - res = nla_put_u32(skb, HSR_A_IFINDEX, ifindex); + res = nla_put_u32(skb, HSR_A_IFINDEX, port->dev->ifindex); if (res < 0) goto nla_put_failure; @@ -165,16 +177,20 @@ nla_put_failure: kfree_skb(skb); fail: - netdev_warn(hsr_priv->dev, "Could not send HSR ring error message\n"); + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_warn(master->dev, "Could not send HSR ring error message\n"); + rcu_read_unlock(); } /* This is called when we haven't heard from the node with MAC address addr for * some time (just before the node is removed from the node table/list). */ -void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]) +void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]) { struct sk_buff *skb; void *msg_head; + struct hsr_port *master; int res; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -199,7 +215,10 @@ nla_put_failure: kfree_skb(skb); fail: - netdev_warn(hsr_priv->dev, "Could not send HSR node down\n"); + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_warn(master->dev, "Could not send HSR node down\n"); + rcu_read_unlock(); } @@ -220,7 +239,8 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) /* For sending */ struct sk_buff *skb_out; void *msg_head; - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; unsigned char hsr_node_addr_b[ETH_ALEN]; int hsr_node_if1_age; u16 hsr_node_if1_seq; @@ -267,8 +287,8 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (res < 0) goto nla_put_failure; - hsr_priv = netdev_priv(hsr_dev); - res = hsr_get_node_data(hsr_priv, + hsr = netdev_priv(hsr_dev); + res = hsr_get_node_data(hsr, (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]), hsr_node_addr_b, &addr_b_ifindex, @@ -301,9 +321,12 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[0]) + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, - hsr_priv->slave[0]->ifindex); + port->dev->ifindex); + rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -313,9 +336,14 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[1]) + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, - hsr_priv->slave[1]->ifindex); + port->dev->ifindex); + rcu_read_unlock(); + if (res < 0) + goto nla_put_failure; genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); @@ -334,7 +362,7 @@ fail: return res; } -/* Get a list of MacAddressA of all nodes known to this node (other than self). +/* Get a list of MacAddressA of all nodes known to this node (including self). */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { @@ -345,7 +373,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) /* For sending */ struct sk_buff *skb_out; void *msg_head; - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; void *pos; unsigned char addr[ETH_ALEN]; int res; @@ -385,17 +413,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (res < 0) goto nla_put_failure; - hsr_priv = netdev_priv(hsr_dev); + hsr = netdev_priv(hsr_dev); rcu_read_lock(); - pos = hsr_get_next_node(hsr_priv, NULL, addr); + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); if (res < 0) { rcu_read_unlock(); goto nla_put_failure; } - pos = hsr_get_next_node(hsr_priv, pos, addr); + pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index d4579dcc3c7d..3f6b95b5b6b8 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #ifndef __HSR_NETLINK_H @@ -17,13 +17,14 @@ #include <uapi/linux/hsr_netlink.h> struct hsr_priv; +struct hsr_port; int __init hsr_netlink_init(void); void __exit hsr_netlink_exit(void); -void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], - int dev_idx); -void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]); +void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], + struct hsr_port *port); +void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]); void hsr_nl_framedrop(int dropcount, int dev_idx); void hsr_nl_linkdown(int dev_idx); diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c new file mode 100644 index 000000000000..a348dcbcd683 --- /dev/null +++ b/net/hsr/hsr_slave.c @@ -0,0 +1,196 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#include "hsr_slave.h" +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include "hsr_main.h" +#include "hsr_device.h" +#include "hsr_forward.h" +#include "hsr_framereg.h" + + +static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct hsr_port *port; + + if (!skb_mac_header_was_set(skb)) { + WARN_ONCE(1, "%s: skb invalid", __func__); + return RX_HANDLER_PASS; + } + + rcu_read_lock(); /* hsr->node_db, hsr->ports */ + port = hsr_port_get_rcu(skb->dev); + + if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { + /* Directly kill frames sent by ourselves */ + kfree_skb(skb); + goto finish_consume; + } + + if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP)) + goto finish_pass; + + skb_push(skb, ETH_HLEN); + + hsr_forward_skb(skb, port); + +finish_consume: + rcu_read_unlock(); /* hsr->node_db, hsr->ports */ + return RX_HANDLER_CONSUMED; + +finish_pass: + rcu_read_unlock(); /* hsr->node_db, hsr->ports */ + return RX_HANDLER_PASS; +} + +bool hsr_port_exists(const struct net_device *dev) +{ + return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; +} + + +static int hsr_check_dev_ok(struct net_device *dev) +{ + /* Don't allow HSR on non-ethernet like devices */ + if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) || + (dev->addr_len != ETH_ALEN)) { + netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n"); + return -EINVAL; + } + + /* Don't allow enslaving hsr devices */ + if (is_hsr_master(dev)) { + netdev_info(dev, "Cannot create trees of HSR devices.\n"); + return -EINVAL; + } + + if (hsr_port_exists(dev)) { + netdev_info(dev, "This device is already a HSR slave.\n"); + return -EINVAL; + } + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); + return -EINVAL; + } + + if (dev->priv_flags & IFF_DONT_BRIDGE) { + netdev_info(dev, "This device does not support bridging.\n"); + return -EOPNOTSUPP; + } + + /* HSR over bonded devices has not been tested, but I'm not sure it + * won't work... + */ + + return 0; +} + + +/* Setup device to be added to the HSR bridge. */ +static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port) +{ + int res; + + dev_hold(dev); + res = dev_set_promiscuity(dev, 1); + if (res) + goto fail_promiscuity; + + /* FIXME: + * What does net device "adjacency" mean? Should we do + * res = netdev_master_upper_dev_link(port->dev, port->hsr->dev); ? + */ + + res = netdev_rx_handler_register(dev, hsr_handle_frame, port); + if (res) + goto fail_rx_handler; + dev_disable_lro(dev); + + return 0; + +fail_rx_handler: + dev_set_promiscuity(dev, -1); +fail_promiscuity: + dev_put(dev); + + return res; +} + +int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, + enum hsr_port_type type) +{ + struct hsr_port *port, *master; + int res; + + if (type != HSR_PT_MASTER) { + res = hsr_check_dev_ok(dev); + if (res) + return res; + } + + port = hsr_port_get_hsr(hsr, type); + if (port != NULL) + return -EBUSY; /* This port already exists */ + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (port == NULL) + return -ENOMEM; + + if (type != HSR_PT_MASTER) { + res = hsr_portdev_setup(dev, port); + if (res) + goto fail_dev_setup; + } + + port->hsr = hsr; + port->dev = dev; + port->type = type; + + list_add_tail_rcu(&port->port_list, &hsr->ports); + synchronize_rcu(); + + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_update_features(master->dev); + dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + + return 0; + +fail_dev_setup: + kfree(port); + return res; +} + +void hsr_del_port(struct hsr_port *port) +{ + struct hsr_priv *hsr; + struct hsr_port *master; + + hsr = port->hsr; + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + list_del_rcu(&port->port_list); + + if (port != master) { + netdev_update_features(master->dev); + dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + netdev_rx_handler_unregister(port->dev); + dev_set_promiscuity(port->dev, -1); + } + + /* FIXME? + * netdev_upper_dev_unlink(port->dev, port->hsr->dev); + */ + + synchronize_rcu(); + dev_put(port->dev); +} diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h new file mode 100644 index 000000000000..3ccfbf71c92e --- /dev/null +++ b/net/hsr/hsr_slave.h @@ -0,0 +1,38 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#ifndef __HSR_SLAVE_H +#define __HSR_SLAVE_H + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include "hsr_main.h" + +int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, + enum hsr_port_type pt); +void hsr_del_port(struct hsr_port *port); +bool hsr_port_exists(const struct net_device *dev); + +static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) +{ + ASSERT_RTNL(); + return hsr_port_exists(dev) ? + rtnl_dereference(dev->rx_handler_data) : NULL; +} + +static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) +{ + return hsr_port_exists(dev) ? + rcu_dereference(dev->rx_handler_data) : NULL; +} + +#endif /* __HSR_SLAVE_H */ diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index fe6bd7a71081..016b77ee88f0 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -80,14 +80,14 @@ lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) static inline void lowpan_address_flip(u8 *src, u8 *dest) { int i; + for (i = 0; i < IEEE802154_ADDR_LEN; i++) (dest)[IEEE802154_ADDR_LEN - i - 1] = (src)[i]; } -static int lowpan_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) +static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) { const u8 *saddr = _saddr; const u8 *daddr = _daddr; @@ -144,7 +144,7 @@ static int lowpan_header_create(struct sk_buff *skb, } static int lowpan_give_skb_to_devices(struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev) { struct lowpan_dev_record *entry; struct sk_buff *skb_cp; @@ -368,24 +368,28 @@ static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_phy(real_dev); } static __le16 lowpan_get_pan_id(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); } static __le16 lowpan_get_short_addr(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); } static u8 lowpan_get_dsn(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); } @@ -454,7 +458,7 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) } static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *orig_dev) { struct ieee802154_hdr hdr; int ret; diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c index 351d9a94ec2f..29e0de63001b 100644 --- a/net/ieee802154/af_ieee802154.c +++ b/net/ieee802154/af_ieee802154.c @@ -40,9 +40,7 @@ #include "af802154.h" -/* - * Utility function for families - */ +/* Utility function for families */ struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { @@ -87,8 +85,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) rtnl_unlock(); break; default: - pr_warning("Unsupported ieee802154 address type: %d\n", - addr->mode); + pr_warn("Unsupported ieee802154 address type: %d\n", + addr->mode); break; } @@ -106,7 +104,7 @@ static int ieee802154_sock_release(struct socket *sock) return 0; } static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -114,7 +112,7 @@ static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, - int addr_len) + int addr_len) { struct sock *sk = sock->sk; @@ -125,7 +123,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) + int addr_len, int flags) { struct sock *sk = sock->sk; @@ -139,7 +137,7 @@ static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, - unsigned int cmd) + unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; @@ -167,7 +165,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) + unsigned long arg) { struct sock *sk = sock->sk; @@ -238,8 +236,7 @@ static const struct proto_ops ieee802154_dgram_ops = { }; -/* - * Create a socket. Initialise the socket, blank the addresses +/* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, @@ -301,13 +298,14 @@ static const struct net_proto_family ieee802154_family_ops = { }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG - print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); + print_hex_dump_bytes("ieee802154_rcv ", + DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index 4f0ed8780194..ef2ad8aaef13 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -149,8 +149,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { - /* - * We will only return the amount + /* We will only return the amount * of this packet since that is all * that will be read. */ @@ -161,12 +160,13 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) } } + return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, - int len) + int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); @@ -205,7 +205,7 @@ static int dgram_disconnect(struct sock *sk, int flags) } static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) + struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -248,8 +248,8 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, - msg->msg_flags & MSG_DONTWAIT, - &err); + msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) goto out_dev; @@ -262,7 +262,8 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, cb->ackreq = ro->want_ack; if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, + daddr, msg->msg_name); ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { @@ -304,8 +305,8 @@ out: } static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, int flags, - int *addr_len) + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -398,6 +399,7 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) dgram_sk(sk))) { if (prev) { struct sk_buff *clone; + clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); @@ -407,9 +409,9 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) } } - if (prev) + if (prev) { dgram_rcv_skb(prev, skb); - else { + } else { kfree_skb(skb); ret = NET_RX_DROP; } @@ -419,7 +421,7 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) } static int dgram_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); @@ -463,7 +465,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, } static int dgram_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h index 8b83a231299e..5d352f86979e 100644 --- a/net/ieee802154/ieee802154.h +++ b/net/ieee802154/ieee802154.h @@ -43,7 +43,7 @@ struct genl_info; struct sk_buff *ieee802154_nl_create(int flags, u8 req); int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group); struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, - int flags, u8 req); + int flags, u8 req); int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info); extern struct genl_family nl802154_family; diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 26efcf4fd2ff..9222966f5e6d 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -52,7 +52,7 @@ struct sk_buff *ieee802154_nl_create(int flags, u8 req) spin_lock_irqsave(&ieee802154_seq_lock, f); hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, - &nl802154_family, flags, req); + &nl802154_family, flags, req); spin_unlock_irqrestore(&ieee802154_seq_lock, f); if (!hdr) { nlmsg_free(msg); @@ -86,7 +86,7 @@ struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, return NULL; hdr = genlmsg_put_reply(msg, info, - &nl802154_family, flags, req); + &nl802154_family, flags, req); if (!hdr) { nlmsg_free(msg); return NULL; diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index a3281b8bfd5b..c6bfe22bfa5e 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -60,7 +60,8 @@ static __le16 nla_get_shortaddr(const struct nlattr *nla) } int ieee802154_nl_assoc_indic(struct net_device *dev, - struct ieee802154_addr *addr, u8 cap) + struct ieee802154_addr *addr, + u8 cap) { struct sk_buff *msg; @@ -93,7 +94,7 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_assoc_indic); int ieee802154_nl_assoc_confirm(struct net_device *dev, __le16 short_addr, - u8 status) + u8 status) { struct sk_buff *msg; @@ -119,7 +120,8 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_assoc_confirm); int ieee802154_nl_disassoc_indic(struct net_device *dev, - struct ieee802154_addr *addr, u8 reason) + struct ieee802154_addr *addr, + u8 reason) { struct sk_buff *msg; @@ -205,8 +207,9 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_beacon_indic); int ieee802154_nl_scan_confirm(struct net_device *dev, - u8 status, u8 scan_type, u32 unscanned, u8 page, - u8 *edl/* , struct list_head *pan_desc_list */) + u8 status, u8 scan_type, + u32 unscanned, u8 page, + u8 *edl/* , struct list_head *pan_desc_list */) { struct sk_buff *msg; @@ -260,7 +263,7 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_start_confirm); static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, - u32 seq, int flags, struct net_device *dev) + u32 seq, int flags, struct net_device *dev) { void *hdr; struct wpan_phy *phy; @@ -270,7 +273,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, pr_debug("%s\n", __func__); hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, - IEEE802154_LIST_IFACE); + IEEE802154_LIST_IFACE); if (!hdr) goto out; @@ -330,14 +333,16 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; + nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], - sizeof(name)); + sizeof(name)); dev = dev_get_by_name(&init_net, name); - } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) + } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { dev = dev_get_by_index(&init_net, nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); - else + } else { return NULL; + } if (!dev) return NULL; @@ -435,7 +440,7 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) int ret = -EOPNOTSUPP; if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && - !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_REASON]) return -EINVAL; @@ -464,8 +469,7 @@ out: return ret; } -/* - * PANid, channel, beacon_order = 15, superframe_order = 15, +/* PANid, channel, beacon_order = 15, superframe_order = 15, * PAN_coordinator, battery_life_extension = 0, * coord_realignment = 0, security_enable = 0 */ @@ -559,8 +563,8 @@ int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) page = 0; - ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, - duration); + ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, + page, duration); out: dev_put(dev); @@ -570,7 +574,8 @@ out: int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, - PAN Id, short address */ + * PAN Id, short address + */ struct sk_buff *msg; struct net_device *dev = NULL; int rc = -ENOBUFS; @@ -586,7 +591,7 @@ int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) goto out_dev; rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq, - 0, dev); + 0, dev); if (rc < 0) goto out_free; @@ -598,7 +603,6 @@ out_free: out_dev: dev_put(dev); return rc; - } int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) @@ -616,7 +620,8 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0) + cb->nlh->nlmsg_seq, + NLM_F_MULTI, dev) < 0) break; cont: idx++; @@ -765,6 +770,7 @@ ieee802154_llsec_parse_key_id(struct genl_info *info, case IEEE802154_SCF_KEY_SHORT_INDEX: { u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); + desc->short_source = cpu_to_le32(source); break; } @@ -842,7 +848,7 @@ int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) goto out_dev; hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, - IEEE802154_LLSEC_GETPARAMS); + IEEE802154_LLSEC_GETPARAMS); if (!hdr) goto out_free; @@ -946,7 +952,7 @@ struct llsec_dump_data { static int ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, - int (*step)(struct llsec_dump_data*)) + int (*step)(struct llsec_dump_data *)) { struct net *net = sock_net(skb->sk); struct net_device *dev; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 89b265aea151..972baf83411a 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -36,7 +36,7 @@ #include "ieee802154.h" static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, - u32 seq, int flags, struct wpan_phy *phy) + u32 seq, int flags, struct wpan_phy *phy) { void *hdr; int i, pages = 0; @@ -48,7 +48,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, return -EMSGSIZE; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, - IEEE802154_LIST_PHY); + IEEE802154_LIST_PHY); if (!hdr) goto out; @@ -80,7 +80,8 @@ out: int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, - PAN Id, short address */ + * PAN Id, short address + */ struct sk_buff *msg; struct wpan_phy *phy; const char *name; @@ -105,7 +106,7 @@ int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) goto out_dev; rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq, - 0, phy); + 0, phy); if (rc < 0) goto out_free; @@ -117,7 +118,6 @@ out_free: out_dev: wpan_phy_put(phy); return rc; - } struct dump_phy_data { @@ -137,10 +137,10 @@ static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data) return 0; rc = ieee802154_nl_fill_phy(data->skb, - NETLINK_CB(data->cb->skb).portid, - data->cb->nlh->nlmsg_seq, - NLM_F_MULTI, - phy); + NETLINK_CB(data->cb->skb).portid, + data->cb->nlh->nlmsg_seq, + NLM_F_MULTI, + phy); if (rc < 0) { data->idx--; @@ -238,10 +238,9 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) addr.sa_family = ARPHRD_IEEE802154; nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], - IEEE802154_ADDR_LEN); + IEEE802154_ADDR_LEN); - /* - * strangely enough, some callbacks (inetdev_event) from + /* strangely enough, some callbacks (inetdev_event) from * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 74d54fae33d7..9d1f64806f02 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -96,7 +96,7 @@ out: } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) + int addr_len) { return -ENOTSUPP; } @@ -106,8 +106,8 @@ static int raw_disconnect(struct sock *sk, int flags) return 0; } -static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -145,7 +145,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; @@ -235,7 +235,6 @@ void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { - struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); @@ -248,13 +247,13 @@ void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) } static int raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -274,4 +273,3 @@ struct proto ieee802154_raw_prot = { .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; - diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index 6f1428c4870b..f13d4f32e207 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -50,29 +50,25 @@ static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size, const struct ieee802154_addr *saddr, const struct ieee802154_addr *daddr) { - u32 c; - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - c = jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ieee802154_addr_hash(saddr), + ieee802154_addr_hash(daddr), + (__force u32)(tag + (d_size << 16)), + lowpan_frags.rnd); } -static unsigned int lowpan_hashfn(struct inet_frag_queue *q) +static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) { - struct lowpan_frag_queue *fq; + const struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); } -static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) +static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) { - struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; + const struct lowpan_frag_queue *fq; + const struct lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); return fq->tag == arg->tag && fq->d_size == arg->d_size && @@ -80,10 +76,10 @@ static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) ieee802154_addr_equal(&fq->daddr, arg->dst); } -static void lowpan_frag_init(struct inet_frag_queue *q, void *a) +static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { + const struct lowpan_create_arg *arg = a; struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); @@ -128,7 +124,6 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, arg.src = src; arg.dst = dst; - read_lock(&lowpan_frags.lock); hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, @@ -223,7 +218,6 @@ found: return res; } - inet_frag_lru_move(&fq->q); return -1; err: kfree_skb(skb); @@ -373,11 +367,10 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) if (frag_info->d_size > ieee802154_lowpan->max_dsize) goto err; - inet_frag_evictor(&ieee802154_lowpan->frags, &lowpan_frags, false); - fq = fq_find(net, frag_info, &source, &dest); if (fq != NULL) { int ret; + spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); @@ -393,20 +386,25 @@ err: EXPORT_SYMBOL(lowpan_frag_rcv); #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", @@ -425,10 +423,12 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", - .data = &lowpan_frags.secret_interval, + .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -451,7 +451,10 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &ieee802154_lowpan->frags.high_thresh; + table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; + table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh; table[1].data = &ieee802154_lowpan->frags.low_thresh; + table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; table[3].data = &ieee802154_lowpan->max_dsize; @@ -568,7 +571,6 @@ int __init lowpan_net_frag_init(void) lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; - lowpan_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&lowpan_frags); return ret; diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c index 8d6f6704da84..4955e0fe5883 100644 --- a/net/ieee802154/wpan-class.c +++ b/net/ieee802154/wpan-class.c @@ -48,7 +48,8 @@ MASTER_SHOW(transmit_power, "%d +- 1 dB"); MASTER_SHOW(cca_mode, "%d"); static ssize_t channels_supported_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, + char *buf) { struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); int ret; @@ -57,7 +58,7 @@ static ssize_t channels_supported_show(struct device *dev, mutex_lock(&phy->pib_lock); for (i = 0; i < 32; i++) { ret = snprintf(buf + len, PAGE_SIZE - len, - "%#09x\n", phy->channels_supported[i]); + "%#09x\n", phy->channels_supported[i]); if (ret < 0) break; len += ret; @@ -80,6 +81,7 @@ ATTRIBUTE_GROUPS(pmib); static void wpan_phy_release(struct device *d) { struct wpan_phy *phy = container_of(d, struct wpan_phy, dev); + kfree(phy); } @@ -121,11 +123,12 @@ static int wpan_phy_iter(struct device *dev, void *_data) { struct wpan_phy_iter_data *wpid = _data; struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); + return wpid->fn(phy, wpid->data); } int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), - void *data) + void *data) { struct wpan_phy_iter_data wpid = { .fn = fn, @@ -197,6 +200,7 @@ EXPORT_SYMBOL(wpan_phy_free); static int __init wpan_phy_class_init(void) { int rc; + rc = class_register(&wpan_phy_class); if (rc) goto err; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 05c57f0fcabe..dbc10d84161f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -307,6 +307,10 @@ config NET_IPVTI the notion of a secure tunnel for IPSEC and then use routing protocol on top. +config NET_UDP_TUNNEL + tristate + default n + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f032688d20d3..8ee1cd4053ee 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d5e6836cf772..d156b3c5f363 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1429,6 +1429,9 @@ static int inet_gro_complete(struct sk_buff *skb, int nhoff) int proto = iph->protocol; int err = -ENOSYS; + if (skb->encapsulation) + skb_set_inner_network_header(skb, nhoff); + csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index a3095fdefbed..90c0e8386116 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; + inet_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4e9619bca732..0485bf7f8f03 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, skb_push(skb, hdr_len); + skb_reset_transport_header(skb); greh = (struct gre_base_hdr *)skb->data; greh->flags = tnl_flags_to_gre_flags(tpi->flags); greh->protocol = tpi->proto; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index eb92deb12666..f0bdd47bbbcb 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -263,6 +263,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) int err = -ENOENT; __be16 type; + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + type = greh->protocol; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79c3d947a481..42b7bcf8045b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -739,8 +739,6 @@ static void icmp_unreach(struct sk_buff *skb) /* fall through */ case 0: info = ntohs(icmph->un.frag.mtu); - if (!info) - goto out; } break; case ICMP_SR_FAILED: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6748d420f714..f10eab462282 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) atomic_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST - setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); + setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); im->unsolicit_count = IGMP_Unsolicited_Report_Count; #endif @@ -1944,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_lock(); in_dev = ip_mc_find_dev(net, imr); + if (!in_dev) { + ret = -ENODEV; + goto out; + } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; @@ -1961,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - if (in_dev) - ip_mc_dec_group(in_dev, group); + ip_mc_dec_group(in_dev, group); rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } - if (!in_dev) - ret = -ENODEV; +out: rtnl_unlock(); return ret; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3b01959bf4bb..62b1f73749dc 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,12 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> +#define INETFRAGS_EVICT_BUCKETS 128 +#define INETFRAGS_EVICT_MAX 512 + +/* don't rebuild inetfrag table with new secret more often than this */ +#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static void inet_frag_secret_rebuild(unsigned long dummy) +static unsigned int +inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) +{ + return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); +} + +static bool inet_frag_may_rebuild(struct inet_frags *f) +{ + return time_after(jiffies, + f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); +} + +static void inet_frag_secret_rebuild(struct inet_frags *f) { - struct inet_frags *f = (struct inet_frags *)dummy; - unsigned long now = jiffies; int i; - /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock(&f->lock); + write_seqlock_bh(&f->rnd_seqlock); + + if (!inet_frag_may_rebuild(f)) + goto out; get_random_bytes(&f->rnd, sizeof(u32)); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; struct hlist_node *n; hb = &f->hash[i]; + spin_lock(&hb->chain_lock); + hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = f->hashfn(q); + unsigned int hval = inet_frag_hashfn(f, q); if (hval != i) { struct inet_frag_bucket *hb_dest; @@ -72,76 +93,195 @@ static void inet_frag_secret_rebuild(unsigned long dummy) /* Relink to new hash chain. */ hb_dest = &f->hash[hval]; + + /* This is the only place where we take + * another chain_lock while already holding + * one. As this will not run concurrently, + * we cannot deadlock on hb_dest lock below, if its + * already locked it will be released soon since + * other caller cannot be waiting for hb lock + * that we've taken above. + */ + spin_lock_nested(&hb_dest->chain_lock, + SINGLE_DEPTH_NESTING); hlist_add_head(&q->list, &hb_dest->chain); + spin_unlock(&hb_dest->chain_lock); } } + spin_unlock(&hb->chain_lock); } - write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->secret_interval); + f->rebuild = false; + f->last_rebuild_jiffies = jiffies; +out: + write_sequnlock_bh(&f->rnd_seqlock); +} + +static bool inet_fragq_should_evict(const struct inet_frag_queue *q) +{ + return q->net->low_thresh == 0 || + frag_mem_limit(q->net) >= q->net->low_thresh; +} + +static unsigned int +inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) +{ + struct inet_frag_queue *fq; + struct hlist_node *n; + unsigned int evicted = 0; + HLIST_HEAD(expired); + +evict_again: + spin_lock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &hb->chain, list) { + if (!inet_fragq_should_evict(fq)) + continue; + + if (!del_timer(&fq->timer)) { + /* q expiring right now thus increment its refcount so + * it won't be freed under us and wait until the timer + * has finished executing then destroy it + */ + atomic_inc(&fq->refcnt); + spin_unlock(&hb->chain_lock); + del_timer_sync(&fq->timer); + WARN_ON(atomic_read(&fq->refcnt) != 1); + inet_frag_put(fq, f); + goto evict_again; + } + + /* suppress xmit of (icmp) error packet */ + fq->last_in &= ~INET_FRAG_FIRST_IN; + fq->last_in |= INET_FRAG_EVICTED; + hlist_del(&fq->list); + hlist_add_head(&fq->list, &expired); + ++evicted; + } + + spin_unlock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &expired, list) + f->frag_expire((unsigned long) fq); + + return evicted; +} + +static void inet_frag_worker(struct work_struct *work) +{ + unsigned int budget = INETFRAGS_EVICT_BUCKETS; + unsigned int i, evicted = 0; + struct inet_frags *f; + + f = container_of(work, struct inet_frags, frags_work); + + BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); + + local_bh_disable(); + + for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + evicted += inet_evict_bucket(f, &f->hash[i]); + i = (i + 1) & (INETFRAGS_HASHSZ - 1); + if (evicted > INETFRAGS_EVICT_MAX) + break; + } + + f->next_bucket = i; + + local_bh_enable(); + + if (f->rebuild && inet_frag_may_rebuild(f)) + inet_frag_secret_rebuild(f); +} + +static void inet_frag_schedule_worker(struct inet_frags *f) +{ + if (unlikely(!work_pending(&f->frags_work))) + schedule_work(&f->frags_work); } void inet_frags_init(struct inet_frags *f) { int i; + INIT_WORK(&f->frags_work, inet_frag_worker); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i]; spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } - rwlock_init(&f->lock); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, - (unsigned long)f); - f->secret_timer.expires = jiffies + f->secret_interval; - add_timer(&f->secret_timer); + seqlock_init(&f->rnd_seqlock); + f->last_rebuild_jiffies = 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { - nf->nqueues = 0; init_frag_mem_limit(nf); - INIT_LIST_HEAD(&nf->lru_list); - spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { - del_timer(&f->secret_timer); + cancel_work_sync(&f->frags_work); } EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { - nf->low_thresh = 0; + unsigned int seq; + int i; + nf->low_thresh = 0; local_bh_disable(); - inet_frag_evictor(nf, f, true); + +evict_again: + seq = read_seqbegin(&f->rnd_seqlock); + + for (i = 0; i < INETFRAGS_HASHSZ ; i++) + inet_evict_bucket(f, &f->hash[i]); + + if (read_seqretry(&f->rnd_seqlock, seq)) + goto evict_again; + local_bh_enable(); percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static struct inet_frag_bucket * +get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) +__acquires(hb->chain_lock) { struct inet_frag_bucket *hb; - unsigned int hash; + unsigned int seq, hash; + + restart: + seq = read_seqbegin(&f->rnd_seqlock); - read_lock(&f->lock); - hash = f->hashfn(fq); + hash = inet_frag_hashfn(f, fq); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); + if (read_seqretry(&f->rnd_seqlock, seq)) { + spin_unlock(&hb->chain_lock); + goto restart; + } + + return hb; +} + +static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +{ + struct inet_frag_bucket *hb; + + hb = get_frag_bucket_locked(fq, f); hlist_del(&fq->list); spin_unlock(&hb->chain_lock); - - read_unlock(&f->lock); - inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -165,8 +305,7 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, kfree_skb(skb); } -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, - int *work) +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) { struct sk_buff *fp; struct netns_frags *nf; @@ -186,86 +325,30 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, fp = xp; } sum = sum_truesize + f->qsize; - if (work) - *work -= sum; sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); kfree(q); - } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) -{ - struct inet_frag_queue *q; - int work, evicted = 0; - - if (!force) { - if (frag_mem_limit(nf) <= nf->high_thresh) - return 0; - } - - work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0 || force) { - spin_lock(&nf->lru_lock); - - if (list_empty(&nf->lru_list)) { - spin_unlock(&nf->lru_lock); - break; - } - - q = list_first_entry(&nf->lru_list, - struct inet_frag_queue, lru_list); - atomic_inc(&q->refcnt); - /* Remove q from list to avoid several CPUs grabbing it */ - list_del_init(&q->lru_list); - - spin_unlock(&nf->lru_lock); - - spin_lock(&q->lock); - if (!(q->last_in & INET_FRAG_COMPLETE)) - inet_frag_kill(q, f); - spin_unlock(&q->lock); - - if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, &work); - evicted++; - } - - return evicted; -} -EXPORT_SYMBOL(inet_frag_evictor); - static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, struct inet_frag_queue *qp_in, struct inet_frags *f, void *arg) { - struct inet_frag_bucket *hb; + struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; - unsigned int hash; - - read_lock(&f->lock); /* Protects against hash rebuild */ - /* - * While we stayed w/o the lock other CPU could update - * the rnd seed, so we need to re-calculate the hash - * chain. Fortunatelly the qp_in can be used to get one. - */ - hash = f->hashfn(qp_in); - hb = &f->hash[hash]; - spin_lock(&hb->chain_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * released the hash bucket lock. + * such entry could have been created on other cpu before + * we acquired hash bucket lock. */ hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); qp_in->last_in |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; @@ -278,9 +361,8 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); - inet_frag_lru_add(nf, qp); + spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return qp; } @@ -290,6 +372,11 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, { struct inet_frag_queue *q; + if (frag_mem_limit(nf) > nf->high_thresh) { + inet_frag_schedule_worker(f); + return NULL; + } + q = kzalloc(f->qsize, GFP_ATOMIC); if (q == NULL) return NULL; @@ -301,7 +388,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - INIT_LIST_HEAD(&q->lru_list); return q; } @@ -320,12 +406,15 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; int depth = 0; + if (frag_mem_limit(nf) > nf->low_thresh) + inet_frag_schedule_worker(f); + + hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); @@ -333,18 +422,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return q; } depth++; } spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key); - else - return ERR_PTR(-ENOBUFS); + + if (inet_frag_may_rebuild(f)) { + if (!f->rebuild) + f->rebuild = true; + inet_frag_schedule_worker(f); + } + + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ed32313e307c..634fc31aa243 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -86,11 +86,6 @@ static inline u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_nqueues(struct net *net) -{ - return net->ipv4.frags.nqueues; -} - int ip_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv4.frags); @@ -109,21 +104,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); + ip4_frags.rnd); } -static unsigned int ip4_hashfn(struct inet_frag_queue *q) +static unsigned int ip4_hashfn(const struct inet_frag_queue *q) { - struct ipq *ipq; + const struct ipq *ipq; ipq = container_of(q, struct ipq, q); return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) { - struct ipq *qp; - struct ip4_create_arg *arg = a; + const struct ipq *qp; + const struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && @@ -133,14 +128,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -static void ip4_frag_init(struct inet_frag_queue *q, void *a) +static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); - struct ip4_create_arg *arg = a; + const struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; @@ -177,18 +172,6 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - */ -static void ip_evictor(struct net *net) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); -} - /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ @@ -207,7 +190,8 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + if (!(qp->q.last_in & INET_FRAG_EVICTED)) + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { @@ -260,7 +244,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; - read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); @@ -505,7 +488,6 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -655,9 +637,6 @@ int ip_defrag(struct sk_buff *skb, u32 user) net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - /* Start by cleaning up the memory. */ - ip_evictor(net); - /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; @@ -721,14 +700,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", @@ -740,10 +722,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", - .data = &ip4_frags.secret_interval, + .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -771,7 +755,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; + table[0].extra1 = &net->ipv4.frags.low_thresh; + table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; + table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -873,6 +860,5 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 5e7aecea05cd..ad382499bace 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -288,6 +288,10 @@ int ip_options_compile(struct net *net, optptr++; continue; } + if (unlikely(l < 2)) { + pp_ptr = optptr; + goto error; + } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d3b6b0e9857..b16556836d66 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -962,10 +962,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else - /* only the initial fragment is - time stamped */ - cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -976,7 +972,10 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + + /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; /* * Find where to start putting bytes. diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 097b3e7c1e8f..dd8c8c765799 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -73,12 +73,7 @@ static void __tunnel_dst_set(struct ip_tunnel_dst *idst, { struct dst_entry *old_dst; - if (dst) { - if (dst->flags & DST_NOCACHE) - dst = NULL; - else - dst_clone(dst); - } + dst_clone(dst); old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); dst_release(old_dst); } @@ -108,13 +103,14 @@ static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) rcu_read_lock(); dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; if (dst) { if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - rcu_read_unlock(); tunnel_dst_reset(t); - return NULL; + dst_release(dst); + dst = NULL; } - dst_hold(dst); } rcu_read_unlock(); return (struct rtable *)dst; @@ -173,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || + t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -189,10 +186,11 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { - if ((local != t->parms.iph.saddr && - (local != t->parms.iph.daddr || - !ipv4_is_multicast(local))) || - !(t->dev->flags & IFF_UP)) + if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && + (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) + continue; + + if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) @@ -209,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (t->parms.i_key != key || + t->parms.iph.saddr != 0 || + t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; @@ -305,7 +305,7 @@ static struct net_device *__ip_tunnel_create(struct net *net, } ASSERT_RTNL(); - dev = alloc_netdev(ops->priv_size, name, ops->setup); + dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b3e86ea7b71b..5bbef4fdcb43 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ -__be32 ic_dev_xid; /* Device under configuration */ - /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; @@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = { .func = ic_bootp_recv, }; +static __be32 ic_dev_xid; /* Device under configuration */ /* * Initialize DHCP/BOOTP extension fields in the request. @@ -1218,10 +1217,10 @@ static int __init ic_dynamic(void) get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); for (;;) { +#ifdef IPCONFIG_BOOTP /* Track the device we are configuring */ ic_dev_xid = d->xid; -#ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 65bcaa789043..c8034587859d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) else sprintf(name, "pimreg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a26ce035e3fa..fb173126f03d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" @@ -159,25 +169,6 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_ULOG - tristate "ULOG target support (obsolete)" - default m if NETFILTER_ADVANCED=n - ---help--- - - This option enables the old IPv4-only "ipt_ULOG" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds a `ULOG' target, which allows you to create rules in - any iptables table. The packet is passed to a userspace logging - daemon using netlink multicast sockets; unlike the LOG target - which can only be viewed through syslog. - - The appropriate userspace logging daemon (ulogd) may be obtained from - <http://www.netfilter.org/projects/ulogd/index.html> - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack config NF_NAT_IPV4 tristate "IPv4 NAT" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 90b82405331e..245db9df3337 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,6 +19,10 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o +# logging +obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o +obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c deleted file mode 100644 index 9cb993cd224b..000000000000 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * netfilter module for userspace packet logging daemons - * - * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2005-2007 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since - * nlbufsiz is used with alloc_skb, which adds another - * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/socket.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <net/netlink.h> -#include <linux/netdevice.h> -#include <linux/mm.h> -#include <linux/moduleparam.h> -#include <linux/netfilter.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_ULOG.h> -#include <net/netfilter/nf_log.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include <linux/bitops.h> -#include <asm/unaligned.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); - -#define ULOG_NL_EVENT 111 /* Harald's favorite number */ -#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0400); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); - -static bool nflog = true; -module_param(nflog, bool, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - -/* global data structures */ - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ -} ulog_buff_t; - -static int ulog_net_id __read_mostly; -struct ulog_net { - unsigned int nlgroup[ULOG_MAXNLGROUPS]; - ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; - struct sock *nflognl; - spinlock_t lock; -}; - -static struct ulog_net *ulog_pernet(struct net *net) -{ - return net_generic(net, ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) -{ - ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - - pr_debug("ulog_send: timer is deleting\n"); - del_timer(&ub->timer); - - if (!ub->skb) { - pr_debug("ulog_send: nothing to send\n"); - return; - } - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; - pr_debug("throwing %d packets to netlink group %u\n", - ub->qlen, nlgroupnum + 1); - netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, - GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; - ub->lastnlh = NULL; -} - - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - unsigned int groupnum = *((unsigned int *)data); - struct ulog_net *ulog = container_of((void *)data, - struct ulog_net, - nlgroup[groupnum]); - pr_debug("timer function called, calling ulog_send\n"); - - /* lock to protect against somebody modifying our structure - * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog->lock); - ulog_send(ulog, groupnum); - spin_unlock_bh(&ulog->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - /* alloc skb which should be big enough for a whole - * multipart message. WARNING: has to be <= 131000 - * due to slab allocator restrictions */ - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate %ub\n", size); - } - } - - return skb; -} - -static void ipt_ulog_packet(struct net *net, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ipt_ulog_info *loginfo, - const char *prefix) -{ - ulog_buff_t *ub; - ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct timeval tv; - struct ulog_net *ulog = ulog_pernet(net); - - /* ffs == find first bit set, necessary because userspace - * is already shifting groupnumber, but we need unshifted. - * ffs() returns [1..32], we need [0..31] */ - unsigned int groupnum = ffs(loginfo->nl_group) - 1; - - /* calculate the size of the skb needed */ - if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len) - copy_len = skb->len; - else - copy_len = loginfo->copy_range; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - - ub = &ulog->ulog_buffers[groupnum]; - - spin_lock_bh(&ulog->lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } else if (ub->qlen >= loginfo->qthreshold || - size > skb_tailroom(ub->skb)) { - /* either the queue len is too high or we don't have - * enough room in nlskb left. send it to userspace. */ - - ulog_send(ulog, groupnum); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } - - pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len, 0); - if (!nlh) { - pr_debug("error during nlmsg_put\n"); - goto out_unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* We might not have a timestamp, get one */ - if (skb->tstamp.tv64 == 0) - __net_timestamp((struct sk_buff *)skb); - - /* copy hook, prefix, timestamp, payload, etc. */ - pm->data_len = copy_len; - tv = ktime_to_timeval(skb->tstamp); - put_unaligned(tv.tv_sec, &pm->timestamp_sec); - put_unaligned(tv.tv_usec, &pm->timestamp_usec); - put_unaligned(skb->mark, &pm->mark); - pm->hook = hooknum; - if (prefix != NULL) { - strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); - pm->prefix[sizeof(pm->prefix) - 1] = '\0'; - } - else if (loginfo->prefix[0] != '\0') - strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - - if (in && in->hard_header_len > 0 && - skb->mac_header != skb->network_header && - in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); - pm->mac_len = in->hard_header_len; - } else - pm->mac_len = 0; - - if (in) - strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - - if (out) - strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - - /* copy_len <= skb->len, so can't fail. */ - if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) - BUG(); - - /* check if we are building multi-part messages */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - /* if timer isn't already running, start it */ - if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - - /* if threshold is reached, send message to userspace */ - if (ub->qlen >= loginfo->qthreshold) { - if (loginfo->qthreshold > 1) - nlh->nlmsg_type = NLMSG_DONE; - ulog_send(ulog, groupnum); - } -out_unlock: - spin_unlock_bh(&ulog->lock); - - return; - -alloc_failure: - pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog->lock); -} - -static unsigned int -ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return XT_CONTINUE; -} - -static void ipt_logfn(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li, - const char *prefix) -{ - struct ipt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nl_group = ULOG_DEFAULT_NLGROUP; - loginfo.copy_range = 0; - loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nl_group = li->u.ulog.group; - loginfo.copy_range = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static int ulog_tg_check(const struct xt_tgchk_param *par) -{ - const struct ipt_ulog_info *loginfo = par->targinfo; - - if (!par->net->xt.ulog_warn_deprecated) { - pr_info("ULOG is deprecated and it will be removed soon, " - "use NFLOG instead\n"); - par->net->xt.ulog_warn_deprecated = true; - } - - if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { - pr_debug("prefix not null-terminated\n"); - return -EINVAL; - } - if (loginfo->qthreshold > ULOG_MAX_QLEN) { - pr_debug("queue threshold %Zu > MAX_QLEN\n", - loginfo->qthreshold); - return -EINVAL; - } - return 0; -} - -#ifdef CONFIG_COMPAT -struct compat_ipt_ulog_info { - compat_uint_t nl_group; - compat_size_t copy_range; - compat_size_t qthreshold; - char prefix[ULOG_PREFIX_LEN]; -}; - -static void ulog_tg_compat_from_user(void *dst, const void *src) -{ - const struct compat_ipt_ulog_info *cl = src; - struct ipt_ulog_info l = { - .nl_group = cl->nl_group, - .copy_range = cl->copy_range, - .qthreshold = cl->qthreshold, - }; - - memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); - memcpy(dst, &l, sizeof(l)); -} - -static int ulog_tg_compat_to_user(void __user *dst, const void *src) -{ - const struct ipt_ulog_info *l = src; - struct compat_ipt_ulog_info cl = { - .nl_group = l->nl_group, - .copy_range = l->copy_range, - .qthreshold = l->qthreshold, - }; - - memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); - return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target ulog_tg_reg __read_mostly = { - .name = "ULOG", - .family = NFPROTO_IPV4, - .target = ulog_tg, - .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ulog_tg_check, -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = ulog_tg_compat_from_user, - .compat_to_user = ulog_tg_compat_to_user, -#endif - .me = THIS_MODULE, -}; - -static struct nf_logger ipt_ulog_logger __read_mostly = { - .name = "ipt_ULOG", - .logfn = ipt_logfn, - .me = THIS_MODULE, -}; - -static int __net_init ulog_tg_net_init(struct net *net) -{ - int i; - struct ulog_net *ulog = ulog_pernet(net); - struct netlink_kernel_cfg cfg = { - .groups = ULOG_MAXNLGROUPS, - }; - - spin_lock_init(&ulog->lock); - /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ulog->nlgroup[i] = i; - setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ulog->nlgroup[i]); - } - - ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ulog->nflognl) - return -ENOMEM; - - if (nflog) - nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; -} - -static void __net_exit ulog_tg_net_exit(struct net *net) -{ - ulog_buff_t *ub; - int i; - struct ulog_net *ulog = ulog_pernet(net); - - if (nflog) - nf_log_unset(net, &ipt_ulog_logger); - - netlink_kernel_release(ulog->nflognl); - - /* remove pending timers and free allocated skb's */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog->ulog_buffers[i]; - pr_debug("timer is deleting\n"); - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } -} - -static struct pernet_operations ulog_tg_net_ops = { - .init = ulog_tg_net_init, - .exit = ulog_tg_net_exit, - .id = &ulog_net_id, - .size = sizeof(struct ulog_net), -}; - -static int __init ulog_tg_init(void) -{ - int ret; - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ulog_tg_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) - goto out_target; - - if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ulog_tg_net_ops); -out_pernet: - return ret; -} - -static void __exit ulog_tg_exit(void) -{ - pr_debug("cleanup_module\n"); - if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - unregister_pernet_subsys(&ulog_tg_net_ops); -} - -module_init(ulog_tg_init); -module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8127dc802865..4ce44c4bc57b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, .get_l4proto = ipv4_get_l4proto, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a338dad41b7d..b91b2641adda 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .error = icmp_error, .destroy = NULL, .me = NULL, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index b8f6381c7d0b..76bd1aef257f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -17,7 +17,7 @@ #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/netfilter/nf_conntrack_zones.h> @@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone = NF_CT_DEFAULT_ZONE; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif @@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, inet->nodefrag) return NF_ACCEPT; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c new file mode 100644 index 000000000000..ccfc78db12ee --- /dev/null +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -0,0 +1,149 @@ +/* + * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * Based on code from ebt_log from: + * + * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +struct arppayload { + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void dump_arp_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int nhoff) +{ + const struct arphdr *ah; + struct arphdr _arph; + const struct arppayload *ap; + struct arppayload _arpp; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); + + /* If it's for Ethernet and the lengths are OK, then log the ARP + * payload. + */ + if (ah->ar_hrd != htons(1) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != sizeof(__be32)) + return; + + ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); + if (ap == NULL) { + nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", + skb->len - sizeof(_arph)); + return; + } + nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); +} + +void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix); + dump_arp_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_arp_logger __read_mostly = { + .name = "nf_log_arp", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_arp_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_arp_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __net_exit nf_log_arp_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_arp_logger); +} + +static struct pernet_operations nf_log_arp_net_ops = { + .init = nf_log_arp_net_init, + .exit = nf_log_arp_net_exit, +}; + +static int __init nf_log_arp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_arp_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __exit nf_log_arp_exit(void) +{ + unregister_pernet_subsys(&nf_log_arp_net_ops); + nf_log_unregister(&nf_arp_logger); +} + +module_init(nf_log_arp_init); +module_exit(nf_log_arp_exit); + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter ARP packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(3, 0); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c new file mode 100644 index 000000000000..078bdca1b607 --- /dev/null +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -0,0 +1,385 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + nf_log_buf_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + nf_log_buf_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + nf_log_buf_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + nf_log_buf_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + nf_log_buf_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + nf_log_buf_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + nf_log_buf_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) { + nf_log_buf_add(m, "MTU=%u ", + ntohs(ich->un.frag.mtu)); + } + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + nf_log_buf_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + nf_log_buf_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + nf_log_buf_add(m, ":%02x", *p); + } + nf_log_buf_add(m, " "); +} + +static void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, + out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip_logger __read_mostly = { + .name = "nf_log_ipv4", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_ipv4_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __net_exit nf_log_ipv4_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip_logger); +} + +static struct pernet_operations nf_log_ipv4_net_ops = { + .init = nf_log_ipv4_net_init, + .exit = nf_log_ipv4_net_exit, +}; + +static int __init nf_log_ipv4_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv4_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __exit nf_log_ipv4_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + nf_log_unregister(&nf_ip_logger); +} + +module_init(nf_log_ipv4_init); +module_exit(nf_log_ipv4_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET, 0); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index d8b2e14efddc..14f5ccd06337 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, @@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv4_decode_session, #endif diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 690d890111bb..9414923f1e15 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = { .manip_pkt = gre_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = gre_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index eb303471bcf6..4557b4ab8342 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae0af9386f7c..8e3eb39f84e7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -52,6 +52,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem; int orphans, sockets; local_bh_disable(); @@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(net), ip_frag_mem(net)); + frag_mem = ip_frag_mem(net); + seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c65160565e1..739db3100c23 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -58,6 +58,7 @@ #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> +#include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> @@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) while (sk) { delivered = 1; - if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && + ip_mc_sf_allow(sk, iph->daddr, iph->saddr, + skb->dev->ifindex)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ @@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_fromiovecend((void *)iph, from, 0, length)) @@ -606,6 +611,8 @@ back_from_confirm: &rt, msg->msg_flags); else { + sock_tx_timestamp(sk, &ipc.tx_flags); + if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 082239ffe34a..3162ea923ded 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1010,7 +1010,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) const struct iphdr *iph = (const struct iphdr *) skb->data; struct flowi4 fl4; struct rtable *rt; - struct dst_entry *dst; + struct dst_entry *odst = NULL; bool new = false; bh_lock_sock(sk); @@ -1018,16 +1018,17 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) if (!ip_sk_accept_pmtu(sk)) goto out; - rt = (struct rtable *) __sk_dst_get(sk); + odst = sk_dst_get(sk); - if (sock_owned_by_user(sk) || !rt) { + if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); - if (!__sk_dst_check(sk, 0)) { + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1037,8 +1038,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); - dst = dst_check(&rt->dst, 0); - if (!dst) { + if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); @@ -1050,10 +1050,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) } if (new) - __sk_dst_set(sk, &rt->dst); + sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); + dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c86624b36a62..c0c75688896e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, + __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eb1dde37e678..9d2118e5fbc7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1108,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); - goto out; + goto out_nopush; } err = -EINVAL; @@ -1282,6 +1282,7 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush: release_sock(sk); return copied + copied_syn; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 62e48cf84e60..9771563ab564 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -131,7 +131,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, struct dst_entry *dst, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40661fc1e233..7832d941dbcd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1106,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1162,7 +1162,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; @@ -1187,7 +1187,7 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1893,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp) tp->lost_out = 0; tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; } void tcp_clear_retrans(struct tcp_sock *tp) @@ -2475,7 +2475,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * losses and/or application stalls), do not perform any further cwnd * reductions, but instead slow start up to ssthresh. */ -static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) +static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2485,8 +2485,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; tp->prr_out = 0; - if (set_ssthresh) - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } @@ -2528,14 +2527,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } @@ -2564,7 +2563,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) tp->retrans_stamp = 0; if (flag & FLAG_ECE) - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); @@ -2665,12 +2664,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tp->undo_retrans = tp->retrans_out ? : -1; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); } tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -3346,7 +3345,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tp->tlp_high_seq = 0; /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ if (!(flag & FLAG_DSACKING_ACK)) { - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); @@ -5877,3 +5876,153 @@ discard: return 0; } EXPORT_SYMBOL(tcp_rcv_state_process); + +static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + if (family == AF_INET) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), + &ireq->ir_rmt_addr, port); +#if IS_ENABLED(CONFIG_IPV6) + else if (family == AF_INET6) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), + &ireq->ir_v6_rmt_addr, port); +#endif +} + +int tcp_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct request_sock *req; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + __u32 isn = TCP_SKB_CB(skb)->when; + bool want_cookie = false, fastopen; + struct flowi fl; + struct tcp_fastopen_cookie foc = { .len = -1 }; + int err; + + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } + + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + req = inet_reqsk_alloc(rsk_ops); + if (!req) + goto drop; + + tcp_rsk(req)->af_specific = af_ops; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = af_ops->mss_clamp; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb, sk); + + af_ops->init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(sk)); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) { + bool strict; + + dst = af_ops->route_req(sk, &fl, req, &strict); + if (dst && strict && + !tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + pr_drop_req(req, ntohs(tcp_hdr(skb)->source), + rsk_ops->family); + goto drop_and_release; + } + + isn = af_ops->init_seq(skb); + } + if (!dst) { + dst = af_ops->route_req(sk, &fl, req, NULL); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return 0; +} +EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77cccda1ad0c..1edc739b9da5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, @@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; + inet_set_txhash(sk); + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -814,6 +816,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * socket. */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) @@ -837,24 +840,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, ireq->ir_rmt_addr, ireq->opt); err = net_xmit_eval(err); - if (!tcp_rsk(req)->snt_synack && !err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; } return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) -{ - int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); - - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} - /* * IPv4 request_sock destructor. */ @@ -1237,161 +1227,68 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) #endif +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ir_loc_addr = ip_hdr(skb)->daddr; + ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(skb); +} + +static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); + + if (strict) { + if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) + *strict = true; + else + *strict = false; + } + + return dst; +} + struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_rtx_synack, + .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { + .mss_clamp = TCP_MSS_DEFAULT, +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, -}; #endif + .init_req = tcp_v4_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v4_init_sequence, +#endif + .route_req = tcp_v4_route_req, + .init_seq = tcp_v4_init_sequence, + .send_synack = tcp_v4_send_synack, + .queue_hash_add = inet_csk_reqsk_queue_hash_add, +}; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct inet_request_sock *ireq; - struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = NULL; - __be32 saddr = ip_hdr(skb)->saddr; - __be32 daddr = ip_hdr(skb)->daddr; - __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false, fastopen; - struct flowi4 fl4; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); - if (!want_cookie) - goto drop; - } - - /* Accept backlog is full. If we have already queued enough - * of warm entries in syn queue, drop request. It is better than - * clogging syn queue with openreqs with exponentially increasing - * timeout. - */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet_reqsk_alloc(&tcp_request_sock_ops); - if (!req) - goto drop; - -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; -#endif - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = TCP_MSS_DEFAULT; - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); - - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); - - ireq = inet_rsk(req); - ireq->ir_loc_addr = daddr; - ireq->ir_rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(skb); - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - if (want_cookie) { - isn = cookie_v4_init_sequence(sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && - fl4.daddr == saddr) { - if (!tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. - * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. - */ - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &saddr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } + return tcp_conn_request(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, skb); - isn = tcp_v4_init_sequence(skb); - } - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) - goto drop_and_free; - - tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v4_send_synack(sk, dst, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_rsk(req)->listener = NULL; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; @@ -1439,6 +1336,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; + inet_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e68e0d4af6c9..1649988bd1b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = np->flow_label >> 12; - tw->tw_ipv6only = np->ipv6only; + tw->tw_ipv6only = sk->sk_ipv6only; } #endif diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4e86c59ec7f7..55046ecd083e 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -309,7 +309,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; return tcp_gro_complete(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d92bce0ea24e..8fcfc91964ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; skb->destructor = tcp_wfree; + skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ @@ -978,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(err <= 0)) return err; - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); return net_xmit_eval(err); } @@ -2525,8 +2526,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans += tcp_skb_pcount(skb); - /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). */ @@ -2534,6 +2533,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } else if (err != -EBUSY) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } + + if (tp->undo_retrans < 0) + tp->undo_retrans = 0; + tp->undo_retrans += tcp_skb_pcount(skb); return err; } @@ -3299,3 +3302,18 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } } + +int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +{ + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + struct flowi fl; + int res; + + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } + return res; +} +EXPORT_SYMBOL(tcp_rtx_synack); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d92f94b7e402..f57c0e4c2326 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -594,27 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct hlist_nulls_node *node; - struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); - - sk_nulls_for_each_from(s, node) { - if (__udp_is_mcast_sock(net, s, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) - goto found; - } - s = NULL; -found: - return s; -} - /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -1588,8 +1567,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; + } rc = 0; @@ -1637,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -1665,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; - unsigned int i, count = 0; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = skb->dev->ifindex; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + udp_table.mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - stack[count++] = sk; - sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { + if (__udp_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } + stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing chain lock, we must take a reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + /* * do the slow work with no lock held */ if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } @@ -2523,79 +2516,3 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; } - -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - __be16 protocol = skb->protocol; - netdev_features_t enc_features; - int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum; - - oldlen = (u16)~skb->len; - - if (unlikely(!pskb_may_pull(skb, tnl_hlen))) - goto out; - - skb->encapsulation = 0; - __skb_pull(skb, tnl_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb_inner_network_offset(skb)); - skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); - - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; - - /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { - skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, - mac_len); - goto out; - } - - outer_hlen = skb_tnl_header_len(skb); - udp_offset = outer_hlen - tnl_hlen; - skb = segs; - do { - struct udphdr *uh; - int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - - skb->mac_len = mac_len; - - skb_push(skb, outer_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, mac_len); - skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; - uh = udp_hdr(skb); - uh->len = htons(len); - - if (need_csum) { - __be32 delta = htonl(oldlen + len); - - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - uh->check = gso_make_checksum(skb, ~uh->check); - - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } - - skb->protocol = protocol; - } while ((skb = skb->next)); -out: - return segs; -} diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 546d2d439dda..4807544d018b 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb) return 0; } +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + __be16 protocol = skb->protocol; + netdev_features_t enc_features; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; + + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = htons(ETH_P_TEB); + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); + goto out; + } + + outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; + skb = segs; + do { + struct udphdr *uh; + int len; + + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + + skb->mac_len = mac_len; + + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; + uh = udp_hdr(skb); + uh->len = htons(len); + + if (need_csum) { + __be32 delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); + + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } + + skb->protocol = protocol; + } while ((skb = skb->next)); +out: + return segs; +} + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c new file mode 100644 index 000000000000..61ec1a65207e --- /dev/null +++ b/net/ipv4/udp_tunnel.c @@ -0,0 +1,100 @@ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/udp.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <net/udp.h> +#include <net/udp_tunnel.h> +#include <net/net_namespace.h> + +int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + int err = -EINVAL; + struct socket *sock = NULL; + +#if IS_ENABLED(CONFIG_IPV6) + if (cfg->family == AF_INET6) { + struct sockaddr_in6 udp6_addr; + + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr), 0); + } + if (err < 0) + goto error; + + udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); + udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); + } else +#endif + if (cfg->family == AF_INET) { + struct sockaddr_in udp_addr; + + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); + if (err < 0) + goto error; + } + + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; + } else { + return -EPFNOSUPPORT; + } + + + *sockp = sock; + + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL(udp_sock_create); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5667b3003af9..4c03c2843094 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -186,6 +186,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -222,6 +223,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -2728,9 +2730,25 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) +{ + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + /* addrconf_add_linklocal also adds a prefix_route and we + * only need to care about prefix routes if ipv6_generate_eui64 + * couldn't generate one. + */ + if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } +} + static void addrconf_dev_config(struct net_device *dev) { - struct in6_addr addr; struct inet6_dev *idev; ASSERT_RTNL(); @@ -2751,11 +2769,7 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; - memset(&addr, 0, sizeof(struct in6_addr)); - addr.s6_addr32[0] = htonl(0xFE800000); - - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); } #if IS_ENABLED(CONFIG_IPV6_SIT) @@ -2777,11 +2791,7 @@ static void addrconf_sit_config(struct net_device *dev) } if (dev->priv_flags & IFF_ISATAP) { - struct in6_addr addr; - - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); return; } @@ -2796,7 +2806,6 @@ static void addrconf_sit_config(struct net_device *dev) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; - struct in6_addr addr; ASSERT_RTNL(); @@ -2805,11 +2814,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); - else - addrconf_prefix_route(&addr, 64, dev, 0, 0); + addrconf_addr_gen(idev, true); } #endif @@ -4321,6 +4326,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; } static inline size_t inet6_ifla6_size(void) @@ -4420,6 +4426,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (nla == NULL) goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + goto nla_put_failure; + read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); @@ -4524,8 +4534,21 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) BUG(); - if (tb[IFLA_INET6_TOKEN]) + if (tb[IFLA_INET6_TOKEN]) { err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + if (err) + return err; + } + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE) + return -EINVAL; + idev->addr_gen_mode = mode; + err = 0; + } return err; } @@ -5168,6 +5191,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec }, { + .procname = "accept_ra_from_local", + .data = &ipv6_devconf.accept_ra_from_local, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { /* sentinel */ } }, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7cb4392690dd..2daa3a133e49 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -197,7 +197,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -294,7 +294,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Binding to v4-mapped address on a v6-only socket * makes no sense */ - if (np->ipv6only) { + if (sk->sk_ipv6only) { err = -EINVAL; goto out; } @@ -371,7 +371,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (addr_type != IPV6_ADDR_MAPPED) - np->ipv6only = 1; + sk->sk_ipv6only = 1; } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; @@ -765,6 +765,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; + net->ipv6.sysctl.auto_flowlabels = 0; atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c3bf2d2e519e..2753319524f1 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -199,6 +199,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; + ip6_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3873181ed856..5f19dfbc4c6a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -322,7 +322,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, else strcpy(name, "ip6gre%d"); - dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!dev) return NULL; @@ -723,7 +724,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. */ ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1174,7 +1176,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ip6_flow_hdr(ipv6h, 0, + ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; @@ -1323,7 +1327,8 @@ static int __net_init ip6gre_init_net(struct net *net) int err; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", - ip6gre_tunnel_setup); + NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!ign->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cb9df0eb4023..2e339d241b69 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -800,8 +801,8 @@ slow_path: /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) - BUG(); + BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), + len)); left -= len; fh->frag_off = htons(offset); @@ -1269,8 +1270,7 @@ emsgsize: } } - /* For UDP, check if TX timestamp is enabled */ - if (sk->sk_type == SOCK_DGRAM) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) sock_tx_timestamp(sk, &tx_flags); /* @@ -1379,12 +1379,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else { - /* Only the initial fragment - * is time stamped. - */ - tx_flags = 0; - } } if (skb == NULL) goto error; @@ -1398,8 +1392,9 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - if (sk->sk_type == SOCK_DGRAM) - skb_shinfo(skb)->tx_flags = tx_flags; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = tx_flags; + tx_flags = 0; /* * Find where to start putting bytes @@ -1569,7 +1564,9 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, np->cork.tclass, + ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index afa082458360..f9de5a695072 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -315,7 +315,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) else sprintf(name, "ip6tnl%%d"); - dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6_tnl_dev_setup); if (dev == NULL) goto failed; @@ -1046,7 +1047,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1772,7 +1774,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", - ip6_tnl_dev_setup); + NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 9aaa6bb229e4..17ee4fc32dfe 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -204,7 +204,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p else sprintf(name, "ip6_vti%%d"); - dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (dev == NULL) goto failed; @@ -1020,7 +1020,7 @@ static int __net_init vti6_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", - vti6_dev_setup); + NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8250474ab7dc..f9a3fd320d1d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -744,7 +744,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) else sprintf(name, "pim6reg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index edb58aff4ae7..0c289982796d 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -235,7 +235,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; - np->ipv6only = valbool; + sk->sk_ipv6only = valbool; retv = 0; break; @@ -834,6 +834,10 @@ pref_skip_coa: np->dontfrag = valbool; retv = 0; break; + case IPV6_AUTOFLOWLABEL: + np->autoflowlabel = valbool; + retv = 0; + break; } release_sock(sk); @@ -1058,7 +1062,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_V6ONLY: - val = np->ipv6only; + val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: @@ -1158,7 +1162,6 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; - break; } case IPV6_TRANSPARENT: @@ -1273,6 +1276,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->dontfrag; break; + case IPV6_AUTOFLOWLABEL: + val = np->autoflowlabel; + break; + default: return -ENOPROTOOPT; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 08b367c6b9cf..617f0958e164 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1301,8 +1301,17 @@ int igmp6_event_query(struct sk_buff *skb) len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); - /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + /* RFC3810 6.2 + * Upon reception of an MLD message that contains a Query, the node + * checks if the source address of the message is a valid link-local + * address, if the Hop Limit is set to 1, and if the Router Alert + * option is present in the Hop-By-Hop Options header of the IPv6 + * packet. If any of these checks fails, the packet is dropped. + */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || + ipv6_hdr(skb)->hop_limit != 1 || + !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || + IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) return -EINVAL; idev = __in6_dev_get(skb->dev); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ca8d4ea48a5d..339078f95d1b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1070,6 +1070,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); + ND_PRINTK(2, info, + "RA: %s, dev: %s\n", + __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; @@ -1102,13 +1105,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, did not accept ra for dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT, dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #endif if (in6_dev->if_flags & IF_RS_SENT) { @@ -1130,11 +1141,24 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); - if (!in6_dev->cnf.accept_ra_defrtr) + if (!in6_dev->cnf.accept_ra_defrtr) { + ND_PRINTK(2, info, + "RA: %s, defrtr is false for dev: %s\n", + __func__, skb->dev->name); goto skip_defrtr; + } - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + /* Do not accept RA with source-addr found on local machine unless + * accept_ra_from_local is set to true. + */ + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; + } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); @@ -1163,8 +1187,10 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } + ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", + rt, lifetime, skb->dev->name); if (rt == NULL && lifetime) { - ND_PRINTK(3, dbg, "RA: adding default router\n"); + ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { @@ -1260,12 +1286,22 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, accept_ra is false for dev: %s\n", + __func__, skb->dev->name); goto out; + } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; + } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; @@ -1293,8 +1329,12 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + __func__, skb->dev->name); goto out; + } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { @@ -1728,7 +1768,7 @@ int __init ndisc_init(void) #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, - &ndisc_ifinfo_sysctl_change); + ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4bff1f297e39..ac93df16f5af 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -55,6 +55,11 @@ config NFT_REJECT_IPV6 default NFT_REJECT tristate +config NF_LOG_IPV6 + tristate "IPv6 packet logging" + depends on NETFILTER_ADVANCED + select NF_LOG_COMMON + config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" depends on INET && IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 70d3dd66f2cd..c0b263104ed2 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -23,6 +23,9 @@ obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o +# logging +obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 54bd9790603f..8b147440fbdc 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -94,7 +94,6 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) break; default: return false; - break; } nexthdr = hp->nexthdr; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0d5279fd852a..3d4bccf6d67d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,6 +63,8 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", @@ -76,14 +78,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -102,7 +107,10 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[0].data = &net->nf_frag.frags.timeout; table[1].data = &net->nf_frag.frags.low_thresh; + table[1].extra2 = &net->nf_frag.frags.high_thresh; table[2].data = &net->nf_frag.frags.high_thresh; + table[2].extra1 = &net->nf_frag.frags.low_thresh; + table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } hdr = register_net_sysctl(net, "net/netfilter", table); @@ -147,16 +155,13 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, nf_frags.rnd); } -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static unsigned int nf_hashfn(const struct inet_frag_queue *q) { const struct frag_queue *nq; @@ -196,7 +201,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.dst = dst; arg.ecn = ecn; - read_lock_bh(&nf_frags.lock); + local_bh_disable(); hash = nf_hash_frag(id, src, dst); q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); @@ -352,7 +357,6 @@ found: fq->q.last_in |= INET_FRAG_FIRST_IN; } - inet_frag_lru_move(&fq->q); return 0; discard_fq: @@ -597,10 +601,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - local_bh_disable(); - inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); - local_bh_enable(); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq == NULL) { @@ -677,7 +677,6 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; - nf_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&nf_frags); ret = register_pernet_subsys(&nf_ct_net_ops); diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c new file mode 100644 index 000000000000..7b17a0be93e7 --- /dev/null +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -0,0 +1,417 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + nf_log_buf_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + nf_log_buf_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + nf_log_buf_add(m, "INCOMPLETE "); + + nf_log_buf_add(m, "ID:%08x ", + ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + nf_log_buf_add(m, "AH "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + nf_log_buf_add(m, "ESP "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + nf_log_buf_add(m, "SPI=0x%x )", + ntohl(eh->spi)); + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, + ptr, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + nf_log_buf_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", + ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + nf_log_buf_add(m, "POINTER=%08x ", + ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + nf_log_buf_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { + nf_log_buf_add(m, "MTU=%u ", + ntohl(ic->icmp6_mtu)); + } + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + nf_log_buf_add(m, ":%02x", *p++); + } + nf_log_buf_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else { + nf_log_buf_add(m, " "); + } +} + +static void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, + loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip6_logger __read_mostly = { + .name = "nf_log_ipv6", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip6_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_ipv6_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __net_exit nf_log_ipv6_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip6_logger); +} + +static struct pernet_operations nf_log_ipv6_net_ops = { + .init = nf_log_ipv6_net_init, + .exit = nf_log_ipv6_net_exit, +}; + +static int __init nf_log_ipv6_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv6_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __exit nf_log_ipv6_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv6_net_ops); + nf_log_unregister(&nf_ip6_logger); +} + +module_init(nf_log_ipv6_init); +module_exit(nf_log_ipv6_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index abfe75a2e316..fc8e49b2ff3e 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -158,6 +158,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -175,6 +176,7 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .l3proto = NFPROTO_IPV6, @@ -183,7 +185,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .manip_pkt = nf_nat_ipv6_manip_pkt, .csum_update = nf_nat_ipv6_csum_update, .csum_recalc = nf_nat_ipv6_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv6_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv6_decode_session, #endif diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 3317440ea341..2d6f860e5c1e 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -33,6 +33,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -42,8 +43,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(net), ip6_frag_mem(net)); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b2dc60b0c764..dee80fb1aa86 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -588,8 +588,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } offset += skb_transport_offset(skb); - if (skb_copy_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_copy_bits(skb, offset, &csum, 2)); /* in case cksum was not initialized */ if (unlikely(csum)) @@ -601,8 +600,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; - if (skb_store_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index cc85a9ba5010..f1709c4a289a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -85,27 +85,23 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); } -static unsigned int ip6_hashfn(struct inet_frag_queue *q) +static unsigned int ip6_hashfn(const struct inet_frag_queue *q) { - struct frag_queue *fq; + const struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -bool ip6_frag_match(struct inet_frag_queue *q, void *a) +bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) { - struct frag_queue *fq; - struct ip6_create_arg *arg = a; + const struct frag_queue *fq; + const struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); return fq->id == arg->id && @@ -115,10 +111,10 @@ bool ip6_frag_match(struct inet_frag_queue *q, void *a) } EXPORT_SYMBOL(ip6_frag_match); -void ip6_frag_init(struct inet_frag_queue *q, void *a) +void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - struct ip6_create_arg *arg = a; + const struct ip6_create_arg *arg = a; fq->id = arg->id; fq->user = arg->user; @@ -145,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + if (!(fq->q.last_in & INET_FRAG_EVICTED)) + IP6_INC_STATS_BH(net, __in6_dev_get(dev), + IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); /* Don't send error if the first segment did not arrive. */ @@ -192,7 +190,6 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.dst = dst; arg.ecn = ecn; - read_lock(&ip6_frags.lock); hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); @@ -353,7 +350,6 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&fq->q); return -1; discard_fq: @@ -523,7 +519,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - int evicted; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -552,11 +547,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); - if (evicted) - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS, evicted); - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq != NULL) { @@ -588,20 +578,25 @@ static const struct inet6_protocol frag_protocol = }; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", @@ -613,10 +608,12 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", - .data = &ip6_frags.secret_interval, + .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -636,7 +633,10 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; + table[0].extra1 = &net->ipv6.frags.low_thresh; + table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; + table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -746,7 +746,6 @@ int __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip6_frags); out: return ret; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4f408176dc64..2e9ba035fb5f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -250,7 +250,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, else strcpy(name, "sit%d"); - dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ipip6_tunnel_setup); if (dev == NULL) return NULL; @@ -1729,6 +1730,7 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[3] = sitn->tunnels_r_l; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a822b880689b..83cea1d39466 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -187,7 +187,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops); if (!req) goto out; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..5bf7b61f8ae8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -39,6 +39,13 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_dointvec }, { + .procname = "auto_flowlabels", + .data = &init_net.ipv6.sysctl.auto_flowlabels, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(int), @@ -74,6 +81,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 229239ad96b1..22055b098428 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -198,6 +198,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; + ip6_set_txhash(sk); + /* * TCP over IPv4 */ @@ -470,13 +472,14 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, - struct flowi6 *fl6, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -503,18 +506,6 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) -{ - struct flowi6 fl6; - int res; - - res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} static void tcp_v6_reqsk_destructor(struct request_sock *req) { @@ -718,22 +709,66 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } #endif +static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + ireq->ir_iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + if (!TCP_SKB_CB(skb)->when && + (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || + np->rxopt.bits.rxohlim || np->repflow)) { + atomic_inc(&skb->users); + ireq->pktopts = skb; + } +} + +static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + if (strict) + *strict = true; + return inet6_csk_route_req(sk, &fl->u.ip6, req); +} + struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_v6_rtx_synack, + .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr), +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_reqsk_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, -}; #endif + .init_req = tcp_v6_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v6_init_sequence, +#endif + .route_req = tcp_v6_route_req, + .init_seq = tcp_v6_init_sequence, + .send_synack = tcp_v6_send_synack, + .queue_hash_add = inet6_csk_reqsk_queue_hash_add, +}; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, @@ -973,153 +1008,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? -- erics - */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; - struct tcp_fastopen_cookie foc = { .len = -1 }; - bool want_cookie = false, fastopen; - struct flowi6 fl6; - int err; - if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); - if (!want_cookie) - goto drop; - } - - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); - if (req == NULL) - goto drop; - -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; -#endif - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + return tcp_conn_request(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, skb); - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - ireq->ir_iif = sk->sk_bound_dev_if; - ireq->ir_mark = inet_request_mark(sk, skb); - - /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - if (!isn) { - if (ipv6_opt_accepted(sk, skb) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || - np->repflow) { - atomic_inc(&skb->users); - ireq->pktopts = skb; - } - - if (want_cookie) { - isn = cookie_v6_init_sequence(sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - goto have_isn; - } - - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { - if (!tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. - * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. - */ - LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", - &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } - - isn = tcp_v6_init_sequence(skb); - } -have_isn: - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; - - if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) - goto drop_and_free; - - tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v6_send_synack(sk, dst, &fl6, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->listener = NULL; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; /* don't send reset */ @@ -1235,6 +1134,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; + ip6_set_txhash(newsk); + /* Now IPv6 options... First: no IPv4 options. diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 8517d3cd1aed..01b0ff9a0c2c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -73,7 +73,7 @@ static int tcp6_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; return tcp_gro_complete(skb); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 95c834799288..5b6091de5868 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -79,7 +79,6 @@ static unsigned int udp6_ehashfn(struct net *net, int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk_ipv6only = ipv6_only_sock(sk); int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -95,7 +94,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 1; if (addr_type == IPV6_ADDR_ANY && - !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) return 1; if (sk2_rcv_saddr6 && @@ -674,8 +673,11 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; } - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; + } skb_dst_drop(skb); @@ -690,6 +692,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_unlock_sock(sk); return rc; + csum_error: UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: @@ -699,43 +702,26 @@ drop: return -1; } -static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, const struct in6_addr *loc_addr, - __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) +static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif, unsigned short hnum) { - struct hlist_nulls_node *node; - unsigned short num = ntohs(loc_port); - - sk_nulls_for_each_from(sk, node) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (udp_sk(sk)->udp_port_hash == num && - sk->sk_family == PF_INET6) { - if (inet->inet_dport) { - if (inet->inet_dport != rmt_port) - continue; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr) && - !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) - continue; - - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) - continue; + struct inet_sock *inet = inet_sk(sk); - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) - continue; - } - if (!inet6_mc_check(sk, loc_addr, rmt_addr)) - continue; - return sk; - } - } - return NULL; + if (!net_eq(sock_net(sk), net)) + return false; + + if (udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6 || + (inet->inet_dport && inet->inet_dport != rmt_port) || + (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) + return false; + return true; } static void flush_stack(struct sock **stack, unsigned int count, @@ -759,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -784,43 +771,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, { struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; - unsigned int i, count = 0; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = inet6_iif(skb); + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + udp_table.mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = inet6_iif(skb); - sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - if (uh->check || udp_sk(sk)->no_check6_rx) + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { + if (__udp_v6_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum) && + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + (uh->check || udp_sk(sk)->no_check6_rx)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } stack[count++] = sk; - - sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sock_hold(sk); } } - /* - * before releasing the lock, we must take reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 54747c25c86c..92fafd485deb 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -674,7 +674,6 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) self->daddr = DEV_ADDR_ANY; kfree(discoveries); return -EHOSTUNREACH; - break; } } /* Cleanup our copy of the discovery log */ diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index 365b895da84b..9e0d909390fd 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -293,7 +293,8 @@ static void irda_device_setup(struct net_device *dev) */ struct net_device *alloc_irdadev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup); + return alloc_netdev(sizeof_priv, "irda%d", NET_NAME_UNKNOWN, + irda_device_setup); } EXPORT_SYMBOL(alloc_irdadev); diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index 7ac4d1becbfc..1bc49edf2296 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -1024,7 +1024,6 @@ static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, default: IRDA_DEBUG(2, "%s(), Unknown parameter type!\n", __func__ ); return 0; - break; } /* Insert at end of sk-buffer */ diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index ffcec225b5d9..dc13f1a45f2f 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -96,7 +96,7 @@ static void irlan_eth_setup(struct net_device *dev) */ struct net_device *alloc_irlandev(const char *name) { - return alloc_netdev(sizeof(struct irlan_cb), name, + return alloc_netdev(sizeof(struct irlan_cb), name, NET_NAME_UNKNOWN, irlan_eth_setup); } diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 98ad6ec4bd3c..a5f28d421ea8 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -1426,7 +1426,8 @@ __u8 *irlmp_hint_to_service(__u8 *hint) if (hint[1] & HINT_TELEPHONY) { IRDA_DEBUG(1, "Telephony "); service[i++] = S_TELEPHONY; - } if (hint[1] & HINT_FILE_SERVER) + } + if (hint[1] & HINT_FILE_SERVER) IRDA_DEBUG(1, "File Server "); if (hint[1] & HINT_COMM) { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7a95fa4a3de1..a089b6b91650 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1103,7 +1103,6 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, default: err = -EINVAL; goto out; - break; } } @@ -1543,7 +1542,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) sk->sk_shutdown |= how; if (how == RCV_SHUTDOWN || how == SHUTDOWN_MASK) { - if (iucv->transport == AF_IUCV_TRANS_IUCV) { + if ((iucv->transport == AF_IUCV_TRANS_IUCV) && + iucv->path) { err = pr_iucv->path_quiesce(iucv->path, NULL); if (err) err = -ENOTCONN; diff --git a/net/key/af_key.c b/net/key/af_key.c index ba2a2f95911c..1847ec4e3930 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -405,7 +405,6 @@ static int verify_address_len(const void *p) * XXX When it can, remove this -EINVAL. -DaveM */ return -EINVAL; - break; } return 0; @@ -536,7 +535,6 @@ pfkey_satype2proto(uint8_t satype) return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; - break; default: return 0; } @@ -553,7 +551,6 @@ pfkey_proto2satype(uint16_t proto) return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; - break; default: return 0; } diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig index adb9843dd7cf..378c73b26093 100644 --- a/net/l2tp/Kconfig +++ b/net/l2tp/Kconfig @@ -6,6 +6,7 @@ menuconfig L2TP tristate "Layer Two Tunneling Protocol (L2TP)" depends on (IPV6 || IPV6=n) depends on INET + select NET_UDP_TUNNEL ---help--- Layer Two Tunneling Protocol diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index bea259043205..1109d3bb8dac 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -52,6 +52,7 @@ #include <net/dst.h> #include <net/ip.h> #include <net/udp.h> +#include <net/udp_tunnel.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/protocol.h> @@ -1358,81 +1359,46 @@ static int l2tp_tunnel_sock_create(struct net *net, { int err = -EINVAL; struct socket *sock = NULL; - struct sockaddr_in udp_addr = {0}; - struct sockaddr_l2tpip ip_addr = {0}; -#if IS_ENABLED(CONFIG_IPV6) - struct sockaddr_in6 udp6_addr = {0}; - struct sockaddr_l2tpip6 ip6_addr = {0}; -#endif + struct udp_port_cfg udp_conf; switch (cfg->encap) { case L2TP_ENCAPTYPE_UDP: + memset(&udp_conf, 0, sizeof(udp_conf)); + #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto out; - - sk_change_net(sock->sk, net); - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, cfg->local_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = htons(cfg->local_udp_port); - err = kernel_bind(sock, (struct sockaddr *) &udp6_addr, - sizeof(udp6_addr)); - if (err < 0) - goto out; - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, cfg->peer_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = htons(cfg->peer_udp_port); - err = kernel_connect(sock, - (struct sockaddr *) &udp6_addr, - sizeof(udp6_addr), 0); - if (err < 0) - goto out; - - if (cfg->udp6_zero_tx_checksums) - udp_set_no_check6_tx(sock->sk, true); - if (cfg->udp6_zero_rx_checksums) - udp_set_no_check6_rx(sock->sk, true); + udp_conf.family = AF_INET6; + memcpy(&udp_conf.local_ip6, cfg->local_ip6, + sizeof(udp_conf.local_ip6)); + memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, + sizeof(udp_conf.peer_ip6)); + udp_conf.use_udp6_tx_checksums = + cfg->udp6_zero_tx_checksums; + udp_conf.use_udp6_rx_checksums = + cfg->udp6_zero_rx_checksums; } else #endif { - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto out; - - sk_change_net(sock->sk, net); - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = htons(cfg->local_udp_port); - err = kernel_bind(sock, (struct sockaddr *) &udp_addr, - sizeof(udp_addr)); - if (err < 0) - goto out; - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = htons(cfg->peer_udp_port); - err = kernel_connect(sock, - (struct sockaddr *) &udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto out; + udp_conf.family = AF_INET; + udp_conf.local_ip = cfg->local_ip; + udp_conf.peer_ip = cfg->peer_ip; + udp_conf.use_udp_checksums = cfg->use_udp_checksums; } - if (!cfg->use_udp_checksums) - sock->sk->sk_no_check_tx = 1; + udp_conf.local_udp_port = htons(cfg->local_udp_port); + udp_conf.peer_udp_port = htons(cfg->peer_udp_port); + + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + goto out; break; case L2TP_ENCAPTYPE_IP: #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { + struct sockaddr_l2tpip6 ip6_addr = {0}; + err = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) @@ -1461,6 +1427,8 @@ static int l2tp_tunnel_sock_create(struct net *net, } else #endif { + struct sockaddr_l2tpip ip_addr = {0}; + err = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 76125c57ee6d..edb78e69efe4 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -246,7 +246,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p goto out; } - dev = alloc_netdev(sizeof(*priv), name, l2tp_eth_dev_setup); + dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, + l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; goto out_del_session; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 950909f04ee6..13752d96275e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1365,7 +1365,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, int err; if (level != SOL_PPPOL2TP) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; @@ -1491,7 +1491,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, struct pppol2tp_session *ps; if (level != SOL_PPPOL2TP) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return -EINVAL; if (get_user(len, optlen)) return -EFAULT; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 29be8854a027..01eede7406a5 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1653,9 +1653,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; - ndev = alloc_netdev_mqs(sizeof(*sdata) + - local->hw.vif_data_size, - name, ieee80211_if_setup, txqs, 1); + ndev = alloc_netdev_mqs(sizeof(*sdata) + local->hw.vif_data_size, + name, NET_NAME_UNKNOWN, + ieee80211_if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 94758b9c9ed4..214e63b84e5c 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -157,7 +157,6 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, default: kfree_skb(skb); return -ENOTSUPP; - break; } *pos++ = ie_len; *pos++ = flags; diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c index 2cf66d885e68..b36b2b996578 100644 --- a/net/mac802154/ieee802154_dev.c +++ b/net/mac802154/ieee802154_dev.c @@ -143,6 +143,7 @@ static void mac802154_del_iface(struct wpan_phy *phy, struct net_device *dev) { struct mac802154_sub_if_data *sdata; + ASSERT_RTNL(); sdata = netdev_priv(dev); @@ -166,11 +167,13 @@ mac802154_add_iface(struct wpan_phy *phy, const char *name, int type) switch (type) { case IEEE802154_DEV_MONITOR: dev = alloc_netdev(sizeof(struct mac802154_sub_if_data), - name, mac802154_monitor_setup); + name, NET_NAME_UNKNOWN, + mac802154_monitor_setup); break; case IEEE802154_DEV_WPAN: dev = alloc_netdev(sizeof(struct mac802154_sub_if_data), - name, mac802154_wpan_setup); + name, NET_NAME_UNKNOWN, + mac802154_wpan_setup); break; default: dev = NULL; @@ -276,7 +279,8 @@ ieee802154_alloc_device(size_t priv_data_len, struct ieee802154_ops *ops) } priv = wpan_phy_priv(phy); - priv->hw.phy = priv->phy = phy; + priv->phy = phy; + priv->hw.phy = priv->phy; priv->hw.priv = (char *)priv + ALIGN(sizeof(*priv), NETDEV_ALIGN); priv->ops = ops; @@ -302,29 +306,61 @@ EXPORT_SYMBOL(ieee802154_free_device); int ieee802154_register_device(struct ieee802154_dev *dev) { struct mac802154_priv *priv = mac802154_to_priv(dev); - int rc = -ENOMEM; + int rc = -ENOSYS; + + if (dev->flags & IEEE802154_HW_TXPOWER) { + if (!priv->ops->set_txpower) + goto out; + + priv->phy->set_txpower = mac802154_set_txpower; + } + + if (dev->flags & IEEE802154_HW_LBT) { + if (!priv->ops->set_lbt) + goto out; + + priv->phy->set_lbt = mac802154_set_lbt; + } + + if (dev->flags & IEEE802154_HW_CCA_MODE) { + if (!priv->ops->set_cca_mode) + goto out; + + priv->phy->set_cca_mode = mac802154_set_cca_mode; + } + + if (dev->flags & IEEE802154_HW_CCA_ED_LEVEL) { + if (!priv->ops->set_cca_ed_level) + goto out; + + priv->phy->set_cca_ed_level = mac802154_set_cca_ed_level; + } + + if (dev->flags & IEEE802154_HW_CSMA_PARAMS) { + if (!priv->ops->set_csma_params) + goto out; + + priv->phy->set_csma_params = mac802154_set_csma_params; + } + + if (dev->flags & IEEE802154_HW_FRAME_RETRIES) { + if (!priv->ops->set_frame_retries) + goto out; + + priv->phy->set_frame_retries = mac802154_set_frame_retries; + } priv->dev_workqueue = create_singlethread_workqueue(wpan_phy_name(priv->phy)); - if (!priv->dev_workqueue) + if (!priv->dev_workqueue) { + rc = -ENOMEM; goto out; + } wpan_phy_set_dev(priv->phy, priv->hw.parent); priv->phy->add_iface = mac802154_add_iface; priv->phy->del_iface = mac802154_del_iface; - if (priv->ops->set_txpower) - priv->phy->set_txpower = mac802154_set_txpower; - if (priv->ops->set_lbt) - priv->phy->set_lbt = mac802154_set_lbt; - if (priv->ops->set_cca_mode) - priv->phy->set_cca_mode = mac802154_set_cca_mode; - if (priv->ops->set_cca_ed_level) - priv->phy->set_cca_ed_level = mac802154_set_cca_ed_level; - if (priv->ops->set_csma_params) - priv->phy->set_csma_params = mac802154_set_csma_params; - if (priv->ops->set_frame_retries) - priv->phy->set_frame_retries = mac802154_set_frame_retries; rc = wpan_phy_register(priv->phy); if (rc < 0) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 1456f73b02b9..457058142098 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -538,6 +538,7 @@ static int llsec_recover_addr(struct mac802154_llsec *sec, struct ieee802154_addr *addr) { __le16 caddr = sec->params.coord_shortaddr; + addr->pan_id = sec->params.pan_id; if (caddr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 15aa2f2b03a7..868a040fd422 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -175,9 +175,9 @@ static void phy_chan_notify(struct work_struct *work) mutex_lock(&priv->hw->phy->pib_lock); res = hw->ops->set_channel(&hw->hw, priv->page, priv->chan); - if (res) + if (res) { pr_debug("set_channel failed\n"); - else { + } else { priv->hw->phy->current_channel = priv->chan; priv->hw->phy->current_page = priv->page; } @@ -210,8 +210,9 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) INIT_WORK(&work->work, phy_chan_notify); work->dev = dev; queue_work(priv->hw->dev_workqueue, &work->work); - } else + } else { mutex_unlock(&priv->hw->phy->pib_lock); + } } diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 6d1647399d4f..8124353646ae 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -98,6 +98,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, if (!(priv->hw.flags & IEEE802154_HW_OMIT_CKSUM)) { u16 crc = crc_ccitt(0, skb->data, skb->len); u8 *data = skb_put(skb, 2); + data[0] = crc & 0xff; data[1] = crc >> 8; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e9410d17619d..ad751fe2e82b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -46,6 +46,9 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +config NF_LOG_COMMON + tristate + if NF_CONNTRACK config NF_CONNTRACK_MARK @@ -744,6 +747,7 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" + depends on NF_LOG_IPV4 && NF_LOG_IPV6 default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index bffdad774da7..8308624a406a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -47,6 +47,9 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o +# generic transport layer logging +obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o + obj-$(CONFIG_NF_NAT) += nf_nat.o # NAT protocols (nf_nat) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c42e83d2751c..8416307fdd1d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1807,92 +1807,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#endif { } }; @@ -3778,6 +3692,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(net, &ipvs->tot_stats); } #else @@ -3840,7 +3755,6 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); ip_vs_trash_cleanup(net); - ip_vs_stop_estimator(net, &ipvs->tot_stats); ip_vs_control_net_cleanup_sysctl(net); remove_proc_entry("ip_vs_stats_percpu", net->proc_net); remove_proc_entry("ip_vs_stats", net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index db801263ee9f..eadffb29dec0 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -886,8 +886,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); rcu_read_unlock(); if (!cp) { - if (param->pe_data) - kfree(param->pe_data); + kfree(param->pe_data); IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1f4f954c4b47..de88c4ab5146 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -352,40 +352,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_enable(); } -static void death_by_event(unsigned long ul_conntrack) -{ - struct nf_conn *ct = (void *)ul_conntrack; - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { - /* bad luck, let's retry again */ - ecache->timeout.expires = jiffies + - (prandom_u32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); - return; - } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_put(ct); -} - -static void nf_ct_dying_timeout(struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - /* set a new timer to retry event delivery */ - setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); - ecache->timeout.expires = jiffies + - (prandom_u32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); -} - bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; @@ -394,15 +360,20 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_to_ns(ktime_get_real()); - if (!nf_ct_is_dying(ct) && - unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, - portid, report) < 0)) { + if (nf_ct_is_dying(ct)) + goto delete; + + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); - nf_ct_dying_timeout(ct); + nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); return false; } + + nf_conntrack_ecache_work(nf_ct_net(ct)); set_bit(IPS_DYING_BIT, &ct->status); + delete: nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; @@ -1464,26 +1435,6 @@ void nf_conntrack_flush_report(struct net *net, u32 portid, int report) } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); -static void nf_ct_release_dying_list(struct net *net) -{ - struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct; - struct hlist_nulls_node *n; - int cpu; - - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); - } - spin_unlock_bh(&pcpu->lock); - } -} - static int untrack_refs(void) { int cnt = 0, cpu; @@ -1548,7 +1499,6 @@ i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); - nf_ct_release_dying_list(net); if (atomic_read(&net->ct.count) != 0) busy = 1; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 1df176146567..4e78c57b818f 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -29,6 +29,90 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); +#define ECACHE_RETRY_WAIT (HZ/10) + +enum retry_state { + STATE_CONGESTED, + STATE_RESTART, + STATE_DONE, +}; + +static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) +{ + struct nf_conn *refs[16]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + unsigned int evicted = 0; + enum retry_state ret = STATE_DONE; + + spin_lock(&pcpu->lock); + + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (nf_ct_is_dying(ct)) + continue; + + if (nf_conntrack_event(IPCT_DESTROY, ct)) { + ret = STATE_CONGESTED; + break; + } + + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + refs[evicted] = ct; + + if (++evicted >= ARRAY_SIZE(refs)) { + ret = STATE_RESTART; + break; + } + } + + spin_unlock(&pcpu->lock); + + /* can't _put while holding lock */ + while (evicted) + nf_ct_put(refs[--evicted]); + + return ret; +} + +static void ecache_work(struct work_struct *work) +{ + struct netns_ct *ctnet = + container_of(work, struct netns_ct, ecache_dwork.work); + int cpu, delay = -1; + struct ct_pcpu *pcpu; + + local_bh_disable(); + + for_each_possible_cpu(cpu) { + enum retry_state ret; + + pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); + + ret = ecache_work_evict_list(pcpu); + + switch (ret) { + case STATE_CONGESTED: + delay = ECACHE_RETRY_WAIT; + goto out; + case STATE_RESTART: + delay = 0; + break; + case STATE_DONE: + break; + } + } + + out: + local_bh_enable(); + + ctnet->ecache_dwork_pending = delay > 0; + if (delay >= 0) + schedule_delayed_work(&ctnet->ecache_dwork, delay); +} + /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) @@ -157,7 +241,6 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; -static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; #ifdef CONFIG_SYSCTL static struct ctl_table event_sysctl_table[] = { @@ -168,13 +251,6 @@ static struct ctl_table event_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_events_retry_timeout", - .data = &init_net.ct.sysctl_events_retry_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, {} }; #endif /* CONFIG_SYSCTL */ @@ -196,7 +272,6 @@ static int nf_conntrack_event_init_sysctl(struct net *net) goto out; table[0].data = &net->ct.sysctl_events; - table[1].data = &net->ct.sysctl_events_retry_timeout; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -238,12 +313,13 @@ static void nf_conntrack_event_fini_sysctl(struct net *net) int nf_conntrack_ecache_pernet_init(struct net *net) { net->ct.sysctl_events = nf_ct_events; - net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); return nf_conntrack_event_init_sysctl(net); } void nf_conntrack_ecache_pernet_fini(struct net *net) { + cancel_delayed_work_sync(&net->ct.ecache_dwork); nf_conntrack_event_fini_sysctl(net); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 58579634427d..355a5c4ef763 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -597,6 +597,9 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) ; @@ -742,8 +745,7 @@ static int ctnetlink_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_put((struct nf_conn *)cb->args[1]); - if (cb->data) - kfree(cb->data); + kfree(cb->data); return 0; } @@ -1150,7 +1152,7 @@ static int ctnetlink_done_list(struct netlink_callback *cb) static int ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) { - struct nf_conn *ct, *last = NULL; + struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); @@ -1163,8 +1165,7 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying if (cb->args[2]) return 0; - if (cb->args[0] == nr_cpu_ids) - return 0; + last = (struct nf_conn *)cb->args[1]; for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { struct ct_pcpu *pcpu; @@ -1174,7 +1175,6 @@ ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); spin_lock_bh(&pcpu->lock); - last = (struct nf_conn *)cb->args[1]; list = dying ? &pcpu->dying : &pcpu->unconfirmed; restart: hlist_nulls_for_each_entry(h, n, list, hnnode) { @@ -1193,7 +1193,9 @@ restart: ct); rcu_read_unlock(); if (res < 0) { - nf_conntrack_get(&ct->ct_general); + if (!atomic_inc_not_zero(&ct->ct_general.use)) + continue; + cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; spin_unlock_bh(&pcpu->lock); goto out; @@ -1202,10 +1204,10 @@ restart: if (cb->args[1]) { cb->args[1] = 0; goto restart; - } else - cb->args[2] = 1; + } spin_unlock_bh(&pcpu->lock); } + cb->args[2] = 1; out: if (last) nf_ct_put(last); @@ -2040,6 +2042,9 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ +#endif + ctnetlink_proto_size(ct) ; } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 85296d4eac0e..daad6022c689 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,16 +16,22 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; +static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); static struct nf_logger *__find_logger(int pf, const char *str_logger) { - struct nf_logger *t; + struct nf_logger *log; + int i; + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (loggers[pf][i] == NULL) + continue; - list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { - if (!strnicmp(str_logger, t->name, strlen(t->name))) - return t; + log = rcu_dereference_protected(loggers[pf][i], + lockdep_is_held(&nf_log_mutex)); + if (!strnicmp(str_logger, log->name, strlen(log->name))) + return log; } return NULL; @@ -73,17 +79,14 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(logger->list); i++) - INIT_LIST_HEAD(&logger->list[i]); - mutex_lock(&nf_log_mutex); if (pf == NFPROTO_UNSPEC) { for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) - list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + rcu_assign_pointer(loggers[i][logger->type], logger); } else { /* register at end of list to honor first register win */ - list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); + rcu_assign_pointer(loggers[pf][logger->type], logger); } mutex_unlock(&nf_log_mutex); @@ -98,7 +101,7 @@ void nf_log_unregister(struct nf_logger *logger) mutex_lock(&nf_log_mutex); for (i = 0; i < NFPROTO_NUMPROTO; i++) - list_del(&logger->list[i]); + RCU_INIT_POINTER(loggers[i][logger->type], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unregister); @@ -129,6 +132,48 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf) } EXPORT_SYMBOL(nf_log_unbind_pf); +void nf_logger_request_module(int pf, enum nf_log_type type) +{ + if (loggers[pf][type] == NULL) + request_module("nf-logger-%u-%u", pf, type); +} +EXPORT_SYMBOL_GPL(nf_logger_request_module); + +int nf_logger_find_get(int pf, enum nf_log_type type) +{ + struct nf_logger *logger; + int ret = -ENOENT; + + logger = loggers[pf][type]; + if (logger == NULL) + request_module("nf-logger-%u-%u", pf, type); + + rcu_read_lock(); + logger = rcu_dereference(loggers[pf][type]); + if (logger == NULL) + goto out; + + if (logger && try_module_get(logger->me)) + ret = 0; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_logger_find_get); + +void nf_logger_put(int pf, enum nf_log_type type) +{ + struct nf_logger *logger; + + BUG_ON(loggers[pf][type] == NULL); + + rcu_read_lock(); + logger = rcu_dereference(loggers[pf][type]); + module_put(logger->me); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_logger_put); + void nf_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -143,7 +188,11 @@ void nf_log_packet(struct net *net, const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (loginfo != NULL) + logger = rcu_dereference(loggers[pf][loginfo->type]); + else + logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); @@ -154,6 +203,63 @@ void nf_log_packet(struct net *net, } EXPORT_SYMBOL(nf_log_packet); +#define S_SIZE (1024 - (sizeof(unsigned int) + 1)) + +struct nf_log_buf { + unsigned int count; + char buf[S_SIZE + 1]; +}; +static struct nf_log_buf emergency, *emergency_ptr = &emergency; + +__printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...) +{ + va_list args; + int len; + + if (likely(m->count < S_SIZE)) { + va_start(args, f); + len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args); + va_end(args); + if (likely(m->count + len < S_SIZE)) { + m->count += len; + return 0; + } + } + m->count = S_SIZE; + printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n"); + return -1; +} +EXPORT_SYMBOL_GPL(nf_log_buf_add); + +struct nf_log_buf *nf_log_buf_open(void) +{ + struct nf_log_buf *m = kmalloc(sizeof(*m), GFP_ATOMIC); + + if (unlikely(!m)) { + local_bh_disable(); + do { + m = xchg(&emergency_ptr, NULL); + } while (!m); + } + m->count = 0; + return m; +} +EXPORT_SYMBOL_GPL(nf_log_buf_open); + +void nf_log_buf_close(struct nf_log_buf *m) +{ + m->buf[m->count] = 0; + printk("%s\n", m->buf); + + if (likely(m != &emergency)) + kfree(m); + else { + emergency_ptr = m; + local_bh_enable(); + } +} +EXPORT_SYMBOL_GPL(nf_log_buf_close); + #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { @@ -188,8 +294,7 @@ static int seq_show(struct seq_file *s, void *v) { loff_t *pos = v; const struct nf_logger *logger; - struct nf_logger *t; - int ret; + int i, ret; struct net *net = seq_file_net(s); logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], @@ -203,11 +308,16 @@ static int seq_show(struct seq_file *s, void *v) if (ret < 0) return ret; - list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { - ret = seq_printf(s, "%s", t->name); + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (loggers[*pos][i] == NULL) + continue; + + logger = rcu_dereference_protected(loggers[*pos][i], + lockdep_is_held(&nf_log_mutex)); + ret = seq_printf(s, "%s", logger->name); if (ret < 0) return ret; - if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + if (i == 0 && loggers[*pos][i + 1] != NULL) { ret = seq_printf(s, ","); if (ret < 0) return ret; @@ -389,14 +499,5 @@ static struct pernet_operations nf_log_net_ops = { int __init netfilter_log_init(void) { - int i, ret; - - ret = register_pernet_subsys(&nf_log_net_ops); - if (ret < 0) - return ret; - - for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) - INIT_LIST_HEAD(&(nf_loggers_l[i])); - - return 0; + return register_pernet_subsys(&nf_log_net_ops); } diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c new file mode 100644 index 000000000000..eeb8ef4ff1a3 --- /dev/null +++ b/net/netfilter/nf_log_common.c @@ -0,0 +1,187 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + nf_log_buf_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + nf_log_buf_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); + +out: + return 0; +} +EXPORT_SYMBOL_GPL(nf_log_dump_udp_header); + +int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + nf_log_buf_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & XT_LOG_TCPSEQ) { + nf_log_buf_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + } + + /* Max length: 13 "WINDOW=65535 " */ + nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + nf_log_buf_add(m, "CWR "); + if (th->ece) + nf_log_buf_add(m, "ECE "); + if (th->urg) + nf_log_buf_add(m, "URG "); + if (th->ack) + nf_log_buf_add(m, "ACK "); + if (th->psh) + nf_log_buf_add(m, "PSH "); + if (th->rst) + nf_log_buf_add(m, "RST "); + if (th->syn) + nf_log_buf_add(m, "SYN "); + if (th->fin) + nf_log_buf_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + nf_log_buf_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + + nf_log_buf_add(m, ") "); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); + +void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + nf_log_buf_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} +EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid); + +void +nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix) +{ + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif +} +EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); + +static int __init nf_log_common_init(void) +{ + return 0; +} + +static void __exit nf_log_common_exit(void) {} + +module_init(nf_log_common_init); +module_exit(nf_log_common_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 09096a670c45..552f97cd9fde 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -525,6 +525,39 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static int nf_nat_proto_clean(struct nf_conn *ct, void *data) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + + if (nf_nat_proto_remove(ct, data)) + return 1; + + if (!nat || !nat->ct) + return 0; + + /* This netns is being destroyed, and conntrack has nat null binding. + * Remove it from bysource hash, as the table will be freed soon. + * + * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() + * will delete entry from already-freed table. + */ + if (!del_timer(&ct->timeout)) + return 1; + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&nat->bysource); + ct->status &= ~IPS_NAT_DONE_MASK; + nat->ct = NULL; + spin_unlock_bh(&nf_nat_lock); + + add_timer(&ct->timeout); + + /* don't delete conntrack. Although that would make things a lot + * simpler, we'd end up flushing all conntracks on nat rmmod. + */ + return 0; +} + static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) { struct nf_nat_proto_clean clean = { @@ -677,7 +710,7 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -795,7 +828,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, &nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); synchronize_rcu(); nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 83a72a235cae..fbce552a796e 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -95,7 +95,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, } EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index c8be2cdac0bf..b8067b53ff3a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -78,7 +78,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { .manip_pkt = dccp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = dccp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index 754536f2c674..cbc7ade1487b 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -59,7 +59,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { .manip_pkt = sctp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = sctp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 83ec8a6e4c36..37f5505f4529 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -79,7 +79,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_tcp = { .manip_pkt = tcp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = tcp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index 7df613fb34a2..b0ede2f0d8bc 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -70,7 +70,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_udp = { .manip_pkt = udp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = udp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c index 776a0d1317b1..368f14e01e75 100644 --- a/net/netfilter/nf_nat_proto_udplite.c +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -69,7 +69,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { .manip_pkt = udplite_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = udplite_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 624e083125b9..8746ff9a8357 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -35,7 +35,7 @@ int nft_register_afinfo(struct net *net, struct nft_af_info *afi) { INIT_LIST_HEAD(&afi->tables); nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&afi->list, &net->nft.af_info); + list_add_tail_rcu(&afi->list, &net->nft.af_info); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(nft_register_afinfo); void nft_unregister_afinfo(struct nft_af_info *afi) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&afi->list); + list_del_rcu(&afi->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_afinfo); @@ -277,11 +277,14 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -294,11 +297,14 @@ static int nf_tables_dump_tables(struct sk_buff *skb, NLM_F_MULTI, afi->family, table) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } @@ -407,6 +413,9 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if (flags & ~NFT_TABLE_F_DORMANT) return -EINVAL; + if (flags == ctx->table->flags) + return 0; + trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, sizeof(struct nft_trans_table)); if (trans == NULL) @@ -514,7 +523,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, module_put(afi->owner); return err; } - list_add_tail(&table->list, &afi->tables); + list_add_tail_rcu(&table->list, &afi->tables); return 0; } @@ -546,7 +555,7 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, if (err < 0) return err; - list_del(&table->list); + list_del_rcu(&table->list); return 0; } @@ -635,13 +644,20 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) { struct nft_stats *cpu_stats, total; struct nlattr *nest; + unsigned int seq; + u64 pkts, bytes; int cpu; memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); - total.pkts += cpu_stats->pkts; - total.bytes += cpu_stats->bytes; + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + pkts = cpu_stats->pkts; + bytes = cpu_stats->bytes; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + total.pkts += pkts; + total.bytes += bytes; } nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) @@ -761,12 +777,15 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -778,17 +797,19 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NLM_F_MULTI, afi->family, table, chain) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } } done: + rcu_read_unlock(); cb->args[0] = idx; return skb->len; } - static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -861,7 +882,7 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) return ERR_PTR(-EINVAL); - newstats = alloc_percpu(struct nft_stats); + newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) return ERR_PTR(-ENOMEM); @@ -1077,7 +1098,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, } basechain->stats = stats; } else { - stats = alloc_percpu(struct nft_stats); + stats = netdev_alloc_pcpu_stats(struct nft_stats); if (IS_ERR(stats)) { module_put(type->owner); kfree(basechain); @@ -1130,7 +1151,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, goto err2; table->use++; - list_add_tail(&chain->list, &table->chains); + list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: if (!(table->flags & NFT_TABLE_F_DORMANT) && @@ -1180,7 +1201,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, return err; table->use--; - list_del(&chain->list); + list_del_rcu(&chain->list); return 0; } @@ -1199,9 +1220,9 @@ int nft_register_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) - list_add_tail(&type->list, &nf_tables_expressions); + list_add_tail_rcu(&type->list, &nf_tables_expressions); else - list_add(&type->list, &nf_tables_expressions); + list_add_rcu(&type->list, &nf_tables_expressions); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1216,7 +1237,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr); void nft_unregister_expr(struct nft_expr_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&type->list); + list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_expr); @@ -1549,16 +1570,17 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; - u8 genctr = ACCESS_ONCE(net->nft.genctr); - u8 gencursor = ACCESS_ONCE(net->nft.gencursor); - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (family != NFPROTO_UNSPEC && family != afi->family) continue; - list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { - list_for_each_entry(rule, &chain->rules, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_rule_is_active(net, rule)) goto cont; if (idx < s_idx) @@ -1572,6 +1594,8 @@ static int nf_tables_dump_rules(struct sk_buff *skb, NLM_F_MULTI | NLM_F_APPEND, afi->family, table, chain, rule) < 0) goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -1579,9 +1603,7 @@ cont: } } done: - /* Invalidate this dump, a transition to the new generation happened */ - if (gencursor != net->nft.gencursor || genctr != net->nft.genctr) - return -EBUSY; + rcu_read_unlock(); cb->args[0] = idx; return skb->len; @@ -1730,6 +1752,9 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (!create || nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); + + if (chain->use == UINT_MAX) + return -EOVERFLOW; } if (nla[NFTA_RULE_POSITION]) { @@ -1789,14 +1814,15 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) { if (nft_rule_is_active_next(net, old_rule)) { - trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, + trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, old_rule); if (trans == NULL) { err = -ENOMEM; goto err2; } nft_rule_disactivate_next(net, old_rule); - list_add_tail(&rule->list, &old_rule->list); + chain->use--; + list_add_tail_rcu(&rule->list, &old_rule->list); } else { err = -ENOENT; goto err2; @@ -1826,6 +1852,7 @@ err3: list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_clear(net, nft_trans_rule(trans)); nft_trans_destroy(trans); + chain->use++; } err2: nf_tables_rule_destroy(&ctx, rule); @@ -1927,7 +1954,7 @@ static LIST_HEAD(nf_tables_set_ops); int nft_register_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail(&ops->list, &nf_tables_set_ops); + list_add_tail_rcu(&ops->list, &nf_tables_set_ops); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } @@ -1936,7 +1963,7 @@ EXPORT_SYMBOL_GPL(nft_register_set); void nft_unregister_set(struct nft_set_ops *ops) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del(&ops->list); + list_del_rcu(&ops->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_set); @@ -2229,7 +2256,10 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(set, &ctx->table->sets, list) { + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2237,11 +2267,13 @@ static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[0] = idx; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2255,7 +2287,10 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(table, &ctx->afi->tables, list) { + rcu_read_lock(); + cb->seq = ctx->net->nft.base_seq; + + list_for_each_entry_rcu(table, &ctx->afi->tables, list) { if (cur_table) { if (cur_table != table) continue; @@ -2264,7 +2299,7 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, } ctx->table = table; idx = 0; - list_for_each_entry(set, &ctx->table->sets, list) { + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, @@ -2273,12 +2308,14 @@ static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[2] = (unsigned long) table; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2295,7 +2332,10 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, if (cb->args[1]) return skb->len; - list_for_each_entry(afi, &net->nft.af_info, list) { + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { if (cur_family) { if (afi->family != cur_family) continue; @@ -2303,7 +2343,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cur_family = 0; } - list_for_each_entry(table, &afi->tables, list) { + list_for_each_entry_rcu(table, &afi->tables, list) { if (cur_table) { if (cur_table != table) continue; @@ -2314,7 +2354,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, ctx->table = table; ctx->afi = afi; idx = 0; - list_for_each_entry(set, &ctx->table->sets, list) { + list_for_each_entry_rcu(set, &ctx->table->sets, list) { if (idx < s_idx) goto cont; if (nf_tables_fill_set(skb, ctx, set, @@ -2325,6 +2365,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cb->args[3] = afi->family; goto done; } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -2334,6 +2375,7 @@ cont: } cb->args[1] = 1; done: + rcu_read_unlock(); return skb->len; } @@ -2592,7 +2634,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (err < 0) goto err2; - list_add_tail(&set->list, &table->sets); + list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; @@ -2612,7 +2654,7 @@ static void nft_set_destroy(struct nft_set *set) static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { - list_del(&set->list); + list_del_rcu(&set->list); nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); nft_set_destroy(set); } @@ -2647,7 +2689,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (err < 0) return err; - list_del(&set->list); + list_del_rcu(&set->list); ctx.table->use--; return 0; } @@ -2699,14 +2741,14 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } bind: binding->chain = ctx->chain; - list_add_tail(&binding->list, &set->bindings); + list_add_tail_rcu(&binding->list, &set->bindings); return 0; } void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { - list_del(&binding->list); + list_del_rcu(&binding->list); if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS && !(set->flags & NFT_SET_INACTIVE)) @@ -2845,7 +2887,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_UNSPEC; + nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; @@ -3341,7 +3383,7 @@ static int nf_tables_commit(struct sk_buff *skb) struct nft_set *set; /* Bump generation counter, invalidate any dump in progress */ - net->nft.genctr++; + while (++net->nft.base_seq == 0); /* A new generation has just started */ net->nft.gencursor = gencursor_next(net); @@ -3486,12 +3528,12 @@ static int nf_tables_abort(struct sk_buff *skb) } nft_trans_destroy(trans); } else { - list_del(&trans->ctx.table->list); + list_del_rcu(&trans->ctx.table->list); } break; case NFT_MSG_DELTABLE: - list_add_tail(&trans->ctx.table->list, - &trans->ctx.afi->tables); + list_add_tail_rcu(&trans->ctx.table->list, + &trans->ctx.afi->tables); nft_trans_destroy(trans); break; case NFT_MSG_NEWCHAIN: @@ -3502,7 +3544,7 @@ static int nf_tables_abort(struct sk_buff *skb) nft_trans_destroy(trans); } else { trans->ctx.table->use--; - list_del(&trans->ctx.chain->list); + list_del_rcu(&trans->ctx.chain->list); if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && trans->ctx.chain->flags & NFT_BASE_CHAIN) { nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, @@ -3512,8 +3554,8 @@ static int nf_tables_abort(struct sk_buff *skb) break; case NFT_MSG_DELCHAIN: trans->ctx.table->use++; - list_add_tail(&trans->ctx.chain->list, - &trans->ctx.table->chains); + list_add_tail_rcu(&trans->ctx.chain->list, + &trans->ctx.table->chains); nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: @@ -3527,12 +3569,12 @@ static int nf_tables_abort(struct sk_buff *skb) break; case NFT_MSG_NEWSET: trans->ctx.table->use--; - list_del(&nft_trans_set(trans)->list); + list_del_rcu(&nft_trans_set(trans)->list); break; case NFT_MSG_DELSET: trans->ctx.table->use++; - list_add_tail(&nft_trans_set(trans)->list, - &trans->ctx.table->sets); + list_add_tail_rcu(&nft_trans_set(trans)->list, + &trans->ctx.table->sets); nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -3946,6 +3988,7 @@ static int nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.af_info); INIT_LIST_HEAD(&net->nft.commit_list); + net->nft.base_seq = 1; return 0; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 345acfb1720b..3b90eb2b2c55 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -109,7 +109,7 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) struct nft_data data[NFT_REG_MAX + 1]; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - struct nft_stats __percpu *stats; + struct nft_stats *stats; int rulenum; /* * Cache cursor to avoid problems in case that the cursor is updated @@ -205,9 +205,11 @@ next_rule: nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); rcu_read_lock_bh(); - stats = rcu_dereference(nft_base_chain(basechain)->stats); - __this_cpu_inc(stats->pkts); - __this_cpu_add(stats->bytes, pkt->skb->len); + stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); + u64_stats_update_begin(&stats->syncp); + stats->pkts++; + stats->bytes += pkt->skb->len; + u64_stats_update_end(&stats->syncp); rcu_read_unlock_bh(); return nft_base_chain(basechain)->policy; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d292c8d286eb..a11c5ff2f720 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -773,6 +773,7 @@ nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", + .type = NF_LOG_TYPE_ULOG, .logfn = &nfulnl_log_packet, .me = THIS_MODULE, }; @@ -1105,6 +1106,9 @@ MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); +MODULE_ALIAS_NF_LOGGER(AF_INET, 1); +MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8a779be832fb..1840989092ed 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -195,6 +195,15 @@ static void nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgdtor_param par; + + par.net = ctx->net; + par.target = target; + par.targinfo = info; + par.family = ctx->afi->family; + if (par.target->destroy != NULL) + par.target->destroy(&par); module_put(target->me); } @@ -382,6 +391,15 @@ static void nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_mtdtor_param par; + + par.net = ctx->net; + par.match = match; + par.matchinfo = info; + par.family = ctx->afi->family; + if (par.match->destroy != NULL) + par.match->destroy(&par); module_put(match->me); } diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 10cfb156cdf4..bde05f28cf14 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2014 Pablo Neira Ayuso <pablo@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -41,6 +42,8 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, + [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, + [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; static int nft_log_init(const struct nft_ctx *ctx, @@ -50,6 +53,7 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; + int ret; nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -57,30 +61,74 @@ static int nft_log_init(const struct nft_ctx *ctx, if (priv->prefix == NULL) return -ENOMEM; nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); - } else + } else { priv->prefix = (char *)nft_log_null_prefix; + } - li->type = NF_LOG_TYPE_ULOG; + li->type = NF_LOG_TYPE_LOG; + if (tb[NFTA_LOG_LEVEL] != NULL && + tb[NFTA_LOG_GROUP] != NULL) + return -EINVAL; if (tb[NFTA_LOG_GROUP] != NULL) + li->type = NF_LOG_TYPE_ULOG; + + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (tb[NFTA_LOG_LEVEL] != NULL) { + li->u.log.level = + ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); + } else { + li->u.log.level = 4; + } + if (tb[NFTA_LOG_FLAGS] != NULL) { + li->u.log.logflags = + ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + } + break; + case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + if (tb[NFTA_LOG_SNAPLEN] != NULL) { + li->u.ulog.copy_len = + ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); + } + if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { + li->u.ulog.qthreshold = + ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + } + break; + } - if (tb[NFTA_LOG_SNAPLEN] != NULL) - li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); - if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { - li->u.ulog.qthreshold = - ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + if (ctx->afi->family == NFPROTO_INET) { + ret = nf_logger_find_get(NFPROTO_IPV4, li->type); + if (ret < 0) + return ret; + + ret = nf_logger_find_get(NFPROTO_IPV6, li->type); + if (ret < 0) { + nf_logger_put(NFPROTO_IPV4, li->type); + return ret; + } + return 0; } - return 0; + return nf_logger_find_get(ctx->afi->family, li->type); } static void nft_log_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_log *priv = nft_expr_priv(expr); + struct nf_loginfo *li = &priv->loginfo; if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); + + if (ctx->afi->family == NFPROTO_INET) { + nf_logger_put(NFPROTO_IPV4, li->type); + nf_logger_put(NFPROTO_IPV6, li->type); + } else { + nf_logger_put(ctx->afi->family, li->type); + } } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -91,17 +139,33 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) if (priv->prefix != nft_log_null_prefix) if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) goto nla_put_failure; - if (li->u.ulog.group) - if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) - goto nla_put_failure; - if (li->u.ulog.copy_len) - if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, - htonl(li->u.ulog.copy_len))) + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (nla_put_be32(skb, NFTA_LOG_LEVEL, htonl(li->u.log.level))) goto nla_put_failure; - if (li->u.ulog.qthreshold) - if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, - htons(li->u.ulog.qthreshold))) + + if (li->u.log.logflags) { + if (nla_put_be32(skb, NFTA_LOG_FLAGS, + htonl(li->u.log.logflags))) + goto nla_put_failure; + } + break; + case NF_LOG_TYPE_ULOG: + if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; + + if (li->u.ulog.copy_len) { + if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, + htonl(li->u.ulog.copy_len))) + goto nla_put_failure; + } + if (li->u.ulog.qthreshold) { + if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, + htons(li->u.ulog.qthreshold))) + goto nla_put_failure; + } + break; + } return 0; nla_put_failure: diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index a0195d28bcfc..79ff58cd36dc 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -175,12 +175,14 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_be32(skb, NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max))) goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min))) - goto nla_put_failure; - if (nla_put_be32(skb, - NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max))) - goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NAT_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } return 0; nla_put_failure: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 227aa11e8409..47b978bc3100 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -711,28 +711,15 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) - kfree(info->entries[cpu]); - else - vfree(info->entries[cpu]); - } + for_each_possible_cpu(cpu) + kvfree(info->entries[cpu]); if (info->jumpstack != NULL) { - if (sizeof(void *) * info->stacksize > PAGE_SIZE) { - for_each_possible_cpu(cpu) - vfree(info->jumpstack[cpu]); - } else { - for_each_possible_cpu(cpu) - kfree(info->jumpstack[cpu]); - } + for_each_possible_cpu(cpu) + kvfree(info->jumpstack[cpu]); + kvfree(info->jumpstack); } - if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE) - vfree(info->jumpstack); - else - kfree(info->jumpstack); - free_percpu(info->stackptr); kfree(info); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 5ab24843370a..c13b79440ede 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -27,806 +27,6 @@ #include <linux/netfilter/xt_LOG.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_log.h> -#include <net/netfilter/xt_log.h> - -static struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = 5, - .logflags = NF_LOG_MASK, - }, - }, -}; - -static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset) -{ - struct udphdr _udph; - const struct udphdr *uh; - - if (proto == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - sb_add(m, "PROTO=UDP "); - else /* Max length: 14 "PROTO=UDPLITE " */ - sb_add(m, "PROTO=UDPLITE "); - - if (fragment) - goto out; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); - if (uh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), - ntohs(uh->len)); - -out: - return 0; -} - -static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags) -{ - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - sb_add(m, "PROTO=TCP "); - - if (fragment) - return 0; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (th == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & XT_LOG_TCPSEQ) - sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); - - /* Max length: 13 "WINDOW=65535 " */ - sb_add(m, "WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & - TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - sb_add(m, "CWR "); - if (th->ece) - sb_add(m, "ECE "); - if (th->urg) - sb_add(m, "URG "); - if (th->ack) - sb_add(m, "ACK "); - if (th->psh) - sb_add(m, "PSH "); - if (th->rst) - sb_add(m, "RST "); - if (th->syn) - sb_add(m, "SYN "); - if (th->fin) - sb_add(m, "FIN "); - /* Max length: 11 "URGP=65535 " */ - sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - sb_add(m, "OPT (TRUNCATED)"); - return 1; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - sb_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - sb_add(m, "%02X", op[i]); - - sb_add(m, ") "); - } - - return 0; -} - -static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) -{ - if (!sk || sk->sk_state == TCP_TIME_WAIT) - return; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; - sb_add(m, "UID=%u GID=%u ", - from_kuid_munged(&init_user_ns, cred->fsuid), - from_kgid_munged(&init_user_ns, cred->fsgid)); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -/* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, - unsigned int iphoff) -{ - struct iphdr _iph; - const struct iphdr *ih; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); - if (ih == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Important fields: - * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ - /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - sb_add(m, "SRC=%pI4 DST=%pI4 ", - &ih->saddr, &ih->daddr); - - /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, - ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); - - /* Max length: 6 "CE DF MF " */ - if (ntohs(ih->frag_off) & IP_CE) - sb_add(m, "CE "); - if (ntohs(ih->frag_off) & IP_DF) - sb_add(m, "DF "); - if (ntohs(ih->frag_off) & IP_MF) - sb_add(m, "MF "); - - /* Max length: 11 "FRAG:65535 " */ - if (ntohs(ih->frag_off) & IP_OFFSET) - sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - - if ((logflags & XT_LOG_IPOPT) && - ih->ihl * 4 > sizeof(struct iphdr)) { - const unsigned char *op; - unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; - unsigned int i, optsize; - - optsize = ih->ihl * 4 - sizeof(struct iphdr); - op = skb_header_pointer(skb, iphoff+sizeof(_iph), - optsize, _opt); - if (op == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - sb_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - sb_add(m, "%02X", op[i]); - sb_add(m, ") "); - } - - switch (ih->protocol) { - case IPPROTO_TCP: - if (dump_tcp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4, logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (dump_udp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4)) - return; - break; - case IPPROTO_ICMP: { - struct icmphdr _icmph; - const struct icmphdr *ich; - static const size_t required_len[NR_ICMP_TYPES+1] - = { [ICMP_ECHOREPLY] = 4, - [ICMP_DEST_UNREACH] - = 8 + sizeof(struct iphdr), - [ICMP_SOURCE_QUENCH] - = 8 + sizeof(struct iphdr), - [ICMP_REDIRECT] - = 8 + sizeof(struct iphdr), - [ICMP_ECHO] = 4, - [ICMP_TIME_EXCEEDED] - = 8 + sizeof(struct iphdr), - [ICMP_PARAMETERPROB] - = 8 + sizeof(struct iphdr), - [ICMP_TIMESTAMP] = 20, - [ICMP_TIMESTAMPREPLY] = 20, - [ICMP_ADDRESS] = 12, - [ICMP_ADDRESSREPLY] = 12 }; - - /* Max length: 11 "PROTO=ICMP " */ - sb_add(m, "PROTO=ICMP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, - sizeof(_icmph), &_icmph); - if (ich == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES && - required_len[ich->type] && - skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - switch (ich->type) { - case ICMP_ECHOREPLY: - case ICMP_ECHO: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - sb_add(m, "ID=%u SEQ=%u ", - ntohs(ich->un.echo.id), - ntohs(ich->un.echo.sequence)); - break; - - case ICMP_PARAMETERPROB: - /* Max length: 14 "PARAMETER=255 " */ - sb_add(m, "PARAMETER=%u ", - ntohl(ich->un.gateway) >> 24); - break; - case ICMP_REDIRECT: - /* Max length: 24 "GATEWAY=255.255.255.255 " */ - sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); - /* Fall through */ - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_TIME_EXCEEDED: - /* Max length: 3+maxlen */ - if (!iphoff) { /* Only recurse once. */ - sb_add(m, "["); - dump_ipv4_packet(m, info, skb, - iphoff + ih->ihl*4+sizeof(_icmph)); - sb_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH && - ich->code == ICMP_FRAG_NEEDED) - sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); - } - break; - } - /* Max Length */ - case IPPROTO_AH: { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 9 "PROTO=AH " */ - sb_add(m, "PROTO=AH "); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ah = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_ahdr), &_ahdr); - if (ah == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); - break; - } - case IPPROTO_ESP: { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 10 "PROTO=ESP " */ - sb_add(m, "PROTO=ESP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - eh = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_esph), &_esph); - if (eh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); - break; - } - /* Max length: 10 "PROTO 255 " */ - default: - sb_add(m, "PROTO=%u ", ih->protocol); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff) - dump_sk_uid_gid(m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!iphoff && skb->mark) - sb_add(m, "MARK=0x%x ", skb->mark); - - /* Proto Max log string length */ - /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ - /* UDP: 10+max(25,20) = 35 */ - /* UDPLITE: 14+max(25,20) = 39 */ - /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ - /* ESP: 10+max(25)+15 = 50 */ - /* AH: 9+max(25)+15 = 49 */ - /* unknown: 10 */ - - /* (ICMP allows recursion one level deep) */ - /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ -} - -static void dump_ipv4_mac_header(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & XT_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - sb_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int i; - - sb_add(m, "%02x", *p++); - for (i = 1; i < dev->hard_header_len; i++, p++) - sb_add(m, ":%02x", *p); - } - sb_add(m, " "); -} - -static void -log_packet_common(struct sbuff *m, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", - '0' + loginfo->u.log.level, prefix, - in ? in->name : "", - out ? out->name : ""); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - const struct net_device *physindev; - const struct net_device *physoutdev; - - physindev = skb->nf_bridge->physindev; - if (physindev && in != physindev) - sb_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev && out != physoutdev) - sb_add(m, "PHYSOUT=%s ", physoutdev->name); - } -#endif -} - - -static void -ipt_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct sbuff *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) - return; - - m = sb_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); - - if (in != NULL) - dump_ipv4_mac_header(m, loginfo, skb); - - dump_ipv4_packet(m, loginfo, skb, 0); - - sb_close(m); -} - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -/* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int ip6hoff, - int recurse) -{ - u_int8_t currenthdr; - int fragment; - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - unsigned int ptr; - unsigned int hdrlen = 0; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); - if (ih == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); - - /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), - (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, - ih->hop_limit, - (ntohl(*(__be32 *)ih) & 0x000fffff)); - - fragment = 0; - ptr = ip6hoff + sizeof(struct ipv6hdr); - currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { - struct ipv6_opt_hdr _hdr; - const struct ipv6_opt_hdr *hp; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - if (hp == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 48 "OPT (...) " */ - if (logflags & XT_LOG_IPOPT) - sb_add(m, "OPT ( "); - - switch (currenthdr) { - case IPPROTO_FRAGMENT: { - struct frag_hdr _fhdr; - const struct frag_hdr *fh; - - sb_add(m, "FRAG:"); - fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), - &_fhdr); - if (fh == NULL) { - sb_add(m, "TRUNCATED "); - return; - } - - /* Max length: 6 "65535 " */ - sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); - - /* Max length: 11 "INCOMPLETE " */ - if (fh->frag_off & htons(0x0001)) - sb_add(m, "INCOMPLETE "); - - sb_add(m, "ID:%08x ", ntohl(fh->identification)); - - if (ntohs(fh->frag_off) & 0xFFF8) - fragment = 1; - - hdrlen = 8; - - break; - } - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - if (fragment) { - if (logflags & XT_LOG_IPOPT) - sb_add(m, ")"); - return; - } - hdrlen = ipv6_optlen(hp); - break; - /* Max Length */ - case IPPROTO_AH: - if (logflags & XT_LOG_IPOPT) { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - /* Max length: 3 "AH " */ - sb_add(m, "AH "); - - if (fragment) { - sb_add(m, ")"); - return; - } - - ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), - &_ahdr); - if (ah == NULL) { - /* - * Max length: 26 "INCOMPLETE [65535 - * bytes] )" - */ - sb_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 15 "SPI=0xF1234567 */ - sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); - - } - - hdrlen = (hp->hdrlen+2)<<2; - break; - case IPPROTO_ESP: - if (logflags & XT_LOG_IPOPT) { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 4 "ESP " */ - sb_add(m, "ESP "); - - if (fragment) { - sb_add(m, ")"); - return; - } - - /* - * Max length: 26 "INCOMPLETE [65535 bytes] )" - */ - eh = skb_header_pointer(skb, ptr, sizeof(_esph), - &_esph); - if (eh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 16 "SPI=0xF1234567 )" */ - sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); - - } - return; - default: - /* Max length: 20 "Unknown Ext Hdr 255" */ - sb_add(m, "Unknown Ext Hdr %u", currenthdr); - return; - } - if (logflags & XT_LOG_IPOPT) - sb_add(m, ") "); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - } - - switch (currenthdr) { - case IPPROTO_TCP: - if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, - logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) - return; - break; - case IPPROTO_ICMPV6: { - struct icmp6hdr _icmp6h; - const struct icmp6hdr *ic; - - /* Max length: 13 "PROTO=ICMPv6 " */ - sb_add(m, "PROTO=ICMPv6 "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); - if (ic == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); - - switch (ic->icmp6_type) { - case ICMPV6_ECHO_REQUEST: - case ICMPV6_ECHO_REPLY: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - sb_add(m, "ID=%u SEQ=%u ", - ntohs(ic->icmp6_identifier), - ntohs(ic->icmp6_sequence)); - break; - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - break; - - case ICMPV6_PARAMPROB: - /* Max length: 17 "POINTER=ffffffff " */ - sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); - /* Fall through */ - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - /* Max length: 3+maxlen */ - if (recurse) { - sb_add(m, "["); - dump_ipv6_packet(m, info, skb, - ptr + sizeof(_icmp6h), 0); - sb_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) - sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); - } - break; - } - /* Max length: 10 "PROTO=255 " */ - default: - sb_add(m, "PROTO=%u ", currenthdr); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse) - dump_sk_uid_gid(m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (recurse && skb->mark) - sb_add(m, "MARK=0x%x ", skb->mark); -} - -static void dump_ipv6_mac_header(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & XT_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - sb_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int len = dev->hard_header_len; - unsigned int i; - - if (dev->type == ARPHRD_SIT) { - p -= ETH_HLEN; - - if (p < skb->head) - p = NULL; - } - - if (p != NULL) { - sb_add(m, "%02x", *p++); - for (i = 1; i < len; i++) - sb_add(m, ":%02x", *p++); - } - sb_add(m, " "); - - if (dev->type == ARPHRD_SIT) { - const struct iphdr *iph = - (struct iphdr *)skb_mac_header(skb); - sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, - &iph->daddr); - } - } else - sb_add(m, " "); -} - -static void -ip6t_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct sbuff *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) - return; - - m = sb_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); - - if (in != NULL) - dump_ipv6_mac_header(m, loginfo, skb); - - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); - - sb_close(m); -} -#endif static unsigned int log_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -839,17 +39,8 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - if (par->family == NFPROTO_IPV4) - ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, - par->out, &li, loginfo->prefix); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, - par->out, &li, loginfo->prefix); -#endif - else - WARN_ON_ONCE(1); - + nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, + &li, "%s", loginfo->prefix); return XT_CONTINUE; } @@ -870,7 +61,12 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return 0; + return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); +} + +static void log_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_LOG); } static struct xt_target log_tg_regs[] __read_mostly = { @@ -880,6 +76,7 @@ static struct xt_target log_tg_regs[] __read_mostly = { .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, + .destroy = log_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -889,78 +86,19 @@ static struct xt_target log_tg_regs[] __read_mostly = { .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, + .destroy = log_tg_destroy, .me = THIS_MODULE, }, #endif }; -static struct nf_logger ipt_log_logger __read_mostly = { - .name = "ipt_LOG", - .logfn = &ipt_log_packet, - .me = THIS_MODULE, -}; - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -static struct nf_logger ip6t_log_logger __read_mostly = { - .name = "ip6t_LOG", - .logfn = &ip6t_log_packet, - .me = THIS_MODULE, -}; -#endif - -static int __net_init log_net_init(struct net *net) -{ - nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); -#endif - return 0; -} - -static void __net_exit log_net_exit(struct net *net) -{ - nf_log_unset(net, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_unset(net, &ip6t_log_logger); -#endif -} - -static struct pernet_operations log_net_ops = { - .init = log_net_init, - .exit = log_net_exit, -}; - static int __init log_tg_init(void) { - int ret; - - ret = register_pernet_subsys(&log_net_ops); - if (ret < 0) - goto err_pernet; - - ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); - if (ret < 0) - goto err_target; - - nf_log_register(NFPROTO_IPV4, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); -#endif - return 0; - -err_target: - unregister_pernet_subsys(&log_net_ops); -err_pernet: - return ret; + return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } static void __exit log_tg_exit(void) { - unregister_pernet_subsys(&log_net_ops); - nf_log_unregister(&ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_unregister(&ip6t_log_logger); -#endif xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 3045a964f39c..fe9415e5f91d 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -170,7 +170,6 @@ int netlbl_cfg_unlbl_map_add(const char *domain, #endif /* IPv6 */ default: goto cfg_unlbl_map_add_failure; - break; } entry->def.addrsel = addrmap; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 15c731f03fa6..1b38f7fe12f1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -170,7 +170,6 @@ EXPORT_SYMBOL_GPL(netlink_remove_tap); static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; - bool pass = false; /* We take the more conservative approach and * whitelist socket protocols that may pass. @@ -184,11 +183,10 @@ static bool netlink_filter_tap(const struct sk_buff *skb) case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: - pass = true; - break; + return true; } - return pass; + return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, @@ -636,7 +634,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, while (nlk->cb_running && netlink_dump_space(nlk)) { err = netlink_dump(sk); if (err < 0) { - sk->sk_err = err; + sk->sk_err = -err; sk->sk_error_report(sk); break; } @@ -1961,25 +1959,25 @@ struct netlink_broadcast_data { void *tx_data; }; -static int do_one_broadcast(struct sock *sk, - struct netlink_broadcast_data *p) +static void do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) - goto out; + return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) - goto out; + return; if (!net_eq(sock_net(sk), p->net)) - goto out; + return; if (p->failure) { netlink_overrun(sk); - goto out; + return; } sock_hold(sk); @@ -2017,9 +2015,6 @@ static int do_one_broadcast(struct sock *sk, p->skb2 = NULL; } sock_put(sk); - -out: - return 0; } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, @@ -2483,7 +2478,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = ret; + sk->sk_err = -ret; sk->sk_error_report(sk); } } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ede50d197e10..71cf1bffea06 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1418,7 +1418,7 @@ static int __init nr_proto_init(void) struct net_device *dev; sprintf(name, "nr%d", i); - dev = alloc_netdev(0, name, nr_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); goto fail; diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 171cb9949ab5..37deb173c956 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -457,12 +457,10 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, pr_err("Received a ACK/NACK PDU\n"); rc = -EINVAL; goto exit; - break; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: pr_err("Received a SUPERVISOR PDU\n"); rc = -EINVAL; goto exit; - break; } skb_pull(resp, size); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c36856a457ca..fe5cda0deb39 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -38,7 +38,7 @@ #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb); + const struct nlattr *attr, int len); static int make_writable(struct sk_buff *skb, int write_len) { @@ -434,11 +434,17 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, &upcall); } +static bool last_action(const struct nlattr *a, int rem) +{ + return a->nla_len == rem; +} + static int sample(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr) { const struct nlattr *acts_list = NULL; const struct nlattr *a; + struct sk_buff *sample_skb; int rem; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; @@ -455,8 +461,34 @@ static int sample(struct datapath *dp, struct sk_buff *skb, } } - return do_execute_actions(dp, skb, nla_data(acts_list), - nla_len(acts_list), true); + rem = nla_len(acts_list); + a = nla_data(acts_list); + + /* Actions list is either empty or only contains a single user-space + * action, the latter being a special case as it is the only known + * usage of the sample action. + * In these special cases don't clone the skb as there are no + * side-effects in the nested actions. + * Otherwise, clone in case the nested actions have side effects. + */ + if (likely(rem == 0 || (nla_type(a) == OVS_ACTION_ATTR_USERSPACE && + last_action(a, rem)))) { + sample_skb = skb; + skb_get(skb); + } else { + sample_skb = skb_clone(skb, GFP_ATOMIC); + if (!sample_skb) /* Skip sample action when out of memory. */ + return 0; + } + + /* Note that do_execute_actions() never consumes skb. + * In the case where skb has been cloned above it is the clone that + * is consumed. Otherwise the skb_get(skb) call prevents + * consumption by do_execute_actions(). Thus, it is safe to simply + * return the error code and let the caller (also + * do_execute_actions()) free skb on error. + */ + return do_execute_actions(dp, sample_skb, a, rem); } static int execute_set_action(struct sk_buff *skb, @@ -507,7 +539,7 @@ static int execute_set_action(struct sk_buff *skb, /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb) + const struct nlattr *attr, int len) { /* Every output action needs a separate clone of 'skb', but the common * case is just a single output action, so that doing a clone and @@ -551,6 +583,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); + if (unlikely(err)) /* skb already freed. */ + return err; break; } @@ -560,12 +594,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } } - if (prev_port != -1) { - if (keep_skb) - skb = skb_clone(skb, GFP_ATOMIC); - + if (prev_port != -1) do_output(dp, skb, prev_port); - } else if (!keep_skb) + else consume_skb(skb); return 0; @@ -577,6 +608,5 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); OVS_CB(skb)->tun_key = NULL; - return do_execute_actions(dp, skb, acts->actions, - acts->actions_len, false); + return do_execute_actions(dp, skb, acts->actions, acts->actions_len); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0d407bca81e3..3c461e1e4554 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -66,16 +66,16 @@ static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; static struct genl_family dp_datapath_genl_family; -static struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP +static const struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP, }; -static struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP +static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP, }; -struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP +static const struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP, }; /* Check if need to build a reply message. @@ -266,7 +266,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) upcall.cmd = OVS_PACKET_CMD_MISS; upcall.key = &key; upcall.userdata = NULL; - upcall.portid = p->upcall_portid; + upcall.portid = ovs_vport_find_upcall_portid(p, skb); ovs_dp_upcall(dp, skb, &upcall); consume_skb(skb); stats_counter = &stats->n_missed; @@ -276,7 +276,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) OVS_CB(skb)->flow = flow; OVS_CB(skb)->pkt_key = &key; - ovs_flow_stats_update(OVS_CB(skb)->flow, skb); + ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); ovs_execute_actions(dp, skb); stats_counter = &stats->n_hit; @@ -464,7 +464,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); + err = ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); + BUG_ON(err); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -889,8 +890,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* The unmasked key has to be the same for flow updates. */ if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { - error = -EEXIST; - goto err_unlock_ovs; + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); @@ -981,16 +985,12 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; } /* Check that the flow exists. */ - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; } - /* The unmasked key has to be the same for flow updates. */ - if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { - error = -EEXIST; - goto err_unlock_ovs; - } + /* Update actions, if present. */ if (likely(acts)) { old_acts = ovsl_dereference(flow->sf_acts); @@ -1063,8 +1063,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup(&dp->table, &key); - if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { err = -ENOENT; goto unlock; } @@ -1113,8 +1113,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - flow = ovs_flow_tbl_lookup(&dp->table, &key); - if (unlikely(!flow || !ovs_flow_cmp_unmasked_key(flow, &match))) { + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { err = -ENOENT; goto unlock; } @@ -1190,7 +1190,7 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, }; -static struct genl_ops dp_flow_genl_ops[] = { +static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, @@ -1374,7 +1374,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; - parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; ovs_dp_change(dp, a); @@ -1578,7 +1578,7 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, }; -static struct genl_ops dp_datapath_genl_ops[] = { +static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = datapath_policy, @@ -1633,8 +1633,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || - nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + vport->ops->get_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -1642,6 +1642,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, &vport_stats)) goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) + goto nla_put_failure; + err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; @@ -1763,7 +1766,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; - parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; vport = new_vport(&parms); err = PTR_ERR(vport); @@ -1813,8 +1816,14 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - if (a[OVS_VPORT_ATTR_UPCALL_PID]) - vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); @@ -1945,7 +1954,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, }; -static struct genl_ops dp_vport_genl_ops[] = { +static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = vport_policy, @@ -2054,10 +2063,14 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_flow_init(); + err = ovs_internal_dev_rtnl_link_register(); if (err) goto error; + err = ovs_flow_init(); + if (err) + goto error_unreg_rtnl_link; + err = ovs_vport_init(); if (err) goto error_flow_exit; @@ -2084,6 +2097,8 @@ error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); +error_unreg_rtnl_link: + ovs_internal_dev_rtnl_link_unregister(); error: return err; } @@ -2096,6 +2111,7 @@ static void dp_cleanup(void) rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); + ovs_internal_dev_rtnl_link_unregister(); } module_init(dp_init); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 334751cb1528..d07ab538fc9d 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -61,10 +61,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) -void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + struct sk_buff *skb) { struct flow_stats *stats; - __be16 tcp_flags = flow->key.tp.flags; int node = numa_node_id(); stats = rcu_dereference(flow->stats[node]); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index ac395d2cd821..5e5aaed3a85b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -180,7 +180,8 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -void ovs_flow_stats_update(struct sw_flow *, struct sk_buff *); +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + struct sk_buff *); void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, unsigned long *used, __be16 *tcp_flags); void ovs_flow_stats_clear(struct sw_flow *); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 574c3abc9b30..cf2d853646f0 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -456,6 +456,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); } +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + return flow; + } + return NULL; +} + int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct sw_flow_mask *mask; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index ca8a5820f615..5918bff7f3f6 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -76,7 +76,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); - +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 35ec4fed09e2..f49148a07da2 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -110,6 +110,22 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_RCVD; } +/* Called with rcu_read_lock and BH disabled. */ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + struct ovs_net *ovs_net; + struct vport *vport; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + + if (unlikely(!vport)) + return PACKET_REJECT; + else + return PACKET_RCVD; +} + static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); @@ -186,6 +202,7 @@ error: static struct gre_cisco_protocol gre_protocol = { .handler = gre_rcv, + .err_handler = gre_err, .priority = 1, }; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 789af9280e77..84516126e5f3 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -26,6 +26,7 @@ #include <net/dst.h> #include <net/xfrm.h> +#include <net/rtnetlink.h> #include "datapath.h" #include "vport-internal_dev.h" @@ -121,6 +122,10 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_get_stats64 = internal_dev_get_stats, }; +static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { + .kind = "openvswitch", +}; + static void do_setup(struct net_device *netdev) { ether_setup(netdev); @@ -131,14 +136,18 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + eth_hw_addr_random(netdev); } @@ -159,7 +168,8 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) netdev_vport = netdev_vport_priv(vport); netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, do_setup); + parms->name, NET_NAME_UNKNOWN, + do_setup); if (!netdev_vport->dev) { err = -ENOMEM; goto error_free_vport; @@ -248,3 +258,13 @@ struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) return internal_dev_priv(netdev)->vport; } + +int ovs_internal_dev_rtnl_link_register(void) +{ + return rtnl_link_register(&internal_dev_link_ops); +} + +void ovs_internal_dev_rtnl_link_unregister(void) +{ + rtnl_link_unregister(&internal_dev_link_ops); +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h index 9a7d30ecc6a2..1b179a190cff 100644 --- a/net/openvswitch/vport-internal_dev.h +++ b/net/openvswitch/vport-internal_dev.h @@ -24,5 +24,7 @@ int ovs_is_internal_dev(const struct net_device *); struct vport *ovs_internal_dev_get_vport(struct net_device *); +int ovs_internal_dev_rtnl_link_register(void); +void ovs_internal_dev_rtnl_link_unregister(void); #endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0edbd95c60e7..d8b7e247bebf 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -143,8 +143,6 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct rtable *rt; struct flowi4 fl; __be16 src_port; - int port_min; - int port_max; __be16 df; int err; @@ -172,8 +170,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) skb->ignore_df = 1; - inet_get_local_port_range(net, &port_min, &port_max); - src_port = vxlan_src_port(port_min, port_max, skb); + src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 42c0f4a0b78c..702fb21bfe15 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -134,10 +134,12 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->dp = parms->dp; vport->port_no = parms->port_no; - vport->upcall_portid = parms->upcall_portid; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) + return ERR_PTR(-EINVAL); + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); @@ -161,6 +163,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, */ void ovs_vport_free(struct vport *vport) { + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->percpu_stats); kfree(vport); } @@ -327,6 +333,99 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) } /** + * ovs_vport_set_upcall_portids - set upcall portids of @vport. + * + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. + * + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. + */ +int ovs_vport_set_upcall_portids(struct vport *vport, struct nlattr *ids) +{ + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; + + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. + * + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. + * + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) +{ + struct vport_portids *ids; + + ids = rcu_dereference_ovsl(vport->upcall_portids); + + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); +} + +/** + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. + * + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. + * + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. + */ +u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) +{ + struct vport_portids *ids; + u32 ids_index; + u32 hash; + + ids = rcu_dereference(p->upcall_portids); + + if (ids->n_ids == 1 && ids->ids[0] == 0) + return 0; + + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} + +/** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 8d721e62f388..35f89d84b45e 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/netlink.h> #include <linux/openvswitch.h> +#include <linux/reciprocal_div.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> @@ -52,6 +53,10 @@ void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); +int ovs_vport_set_upcall_portids(struct vport *, struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); + int ovs_vport_send(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ @@ -62,13 +67,27 @@ struct vport_err_stats { u64 tx_dropped; u64 tx_errors; }; +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; +}; /** * struct vport - one port within a datapath * @rcu: RCU callback head for deferred destruction. * @dp: Datapath to which this port belongs. - * @upcall_portid: The Netlink port to use for packets received on this port that - * miss the flow table. + * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. @@ -80,7 +99,7 @@ struct vport_err_stats { struct vport { struct rcu_head rcu; struct datapath *dp; - u32 upcall_portid; + struct vport_portids __rcu *upcall_portids; u16 port_no; struct hlist_node hash_node; @@ -111,7 +130,7 @@ struct vport_parms { /* For ovs_vport_alloc(). */ struct datapath *dp; u16 port_no; - u32 upcall_portid; + struct nlattr *upcall_portids; }; /** diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b85c67ccb797..614ca91f785a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3071,10 +3071,8 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); - break; case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); - break; case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index 66dc65e7c6a1..e9a83a637185 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -267,7 +267,7 @@ int gprs_attach(struct sock *sk) return -EINVAL; /* need packet boundaries */ /* Create net device */ - dev = alloc_netdev(sizeof(*gp), ifname, gprs_setup); + dev = alloc_netdev(sizeof(*gp), ifname, NET_NAME_UNKNOWN, gprs_setup); if (!dev) return -ENOMEM; gp = netdev_priv(dev); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8451c8cdc9de..a85c1a086ae4 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1538,7 +1538,7 @@ static int __init rose_proto_init(void) char name[IFNAMSIZ]; sprintf(name, "rose%d", i); - dev = alloc_netdev(0, name, rose_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup); if (!dev) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); rc = -ENOMEM; diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 0ad080790a32..eec998efd020 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -346,7 +346,7 @@ static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, n_elem = ntohl(*xdr++); toklen -= 4; - if (n_elem < 0 || n_elem > max_n_elem) + if (n_elem > max_n_elem) return -EINVAL; *_n_elem = n_elem; if (n_elem > 0) { diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 4f912c0e225b..eb48306033d9 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -218,10 +218,12 @@ static int mirred_device_event(struct notifier_block *unused, if (event == NETDEV_UNREGISTER) list_for_each_entry(m, &mirred_list, tcfm_list) { + spin_lock_bh(&m->tcf_lock); if (m->tcfm_dev == dev) { dev_put(dev); m->tcfm_dev = NULL; } + spin_unlock_bh(&m->tcf_lock); } return NOTIFY_DONE; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 45527e6b52db..c28b0d327b12 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -561,13 +561,14 @@ EXPORT_SYMBOL(tcf_exts_change); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct nlattr *nest; if (exts->type != TCA_OLD_COMPAT) { nest = nla_nest_start(skb, exts->action); if (nest == NULL) @@ -585,10 +586,14 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) nla_nest_end(skb, nest); } } -#endif return 0; -nla_put_failure: __attribute__ ((unused)) + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; +#else + return 0; +#endif } EXPORT_SYMBOL(tcf_exts_dump); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index c721cd4a469f..3e9f76413b3b 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -420,7 +420,7 @@ static void tcindex_destroy(struct tcf_proto *tp) pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; - walker.fn = &tcindex_destroy_element; + walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); kfree(p->perfect); kfree(p->h); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c39b583ace32..70c0be8d0121 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -38,6 +38,7 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/bitmap.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> @@ -460,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) return 0; } +#define NR_U32_NODE (1<<12) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned int i = 0x7FF; + unsigned long i; + unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), + GFP_KERNEL); + if (!bitmap) + return handle | 0xFFF; for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) - if (i < TC_U32_NODE(n->handle)) - i = TC_U32_NODE(n->handle); - i++; + set_bit(TC_U32_NODE(n->handle), bitmap); - return handle | (i > 0xFFF ? 0xFFF : i); + i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); + if (i >= NR_U32_NODE) + i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); + + kfree(bitmap); + return handle | (i >= NR_U32_NODE ? 0xFFF : i); } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index bfd34e4c1afc..7c292d474f47 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -125,7 +125,6 @@ static int em_canid_change(struct tcf_proto *tp, void *data, int len, { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; - struct canid_match *cm_old = (struct canid_match *)m->data; int i; if (!len) @@ -181,12 +180,6 @@ static int em_canid_change(struct tcf_proto *tp, void *data, int len, m->datalen = sizeof(struct canid_match) + len; m->data = (unsigned long)cm; - - if (cm_old != NULL) { - pr_err("canid: Configuring an existing ematch!\n"); - kfree(cm_old); - } - return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e1543b03e39d..fc04fe93c2da 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -108,7 +108,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, /* * Transmit one skb, and handle the return status as required. Holding the - * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this + * __QDISC___STATE_RUNNING bit guarantees that only one CPU can execute this * function. * * Returns to the caller: @@ -156,7 +156,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * - * __QDISC_STATE_RUNNING guarantees only one CPU can process + * __QDISC___STATE_RUNNING guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 474167162947..bd33793b527e 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -485,8 +485,8 @@ static int __init teql_init(void) struct net_device *dev; struct teql_master *master; - dev = alloc_netdev(sizeof(struct teql_master), - "teql%d", teql_master_setup); + dev = alloc_netdev(sizeof(struct teql_master), "teql%d", + NET_NAME_UNKNOWN, teql_master_setup); if (!dev) { err = -ENOMEM; break; diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 5c30b7a873df..3b4ffb021cf1 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ protocol.o endpointola.o associola.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ - inqueue.o outqueue.o ulpqueue.o command.o \ + inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o ssnmap.o auth.o diff --git a/net/sctp/command.c b/net/sctp/command.c deleted file mode 100644 index dd7375851618..000000000000 --- a/net/sctp/command.c +++ /dev/null @@ -1,68 +0,0 @@ -/* SCTP kernel implementation Copyright (C) 1999-2001 - * Cisco, Motorola, and IBM - * Copyright 2001 La Monte H.P. Yarroll - * - * This file is part of the SCTP kernel implementation - * - * These functions manipulate sctp command sequences. - * - * This SCTP implementation is free software; - * you can redistribute it and/or modify it under the terms of - * the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This SCTP implementation is distributed in the hope that it - * will be useful, but WITHOUT ANY WARRANTY; without even the implied - * ************************ - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, see - * <http://www.gnu.org/licenses/>. - * - * Please send any bug reports or fixes you make to the - * email address(es): - * lksctp developers <linux-sctp@vger.kernel.org> - * - * Written or modified by: - * La Monte H.P. Yarroll <piggy@acm.org> - * Karl Knutson <karl@athena.chicago.il.us> - */ - -#include <linux/types.h> -#include <net/sctp/sctp.h> -#include <net/sctp/sm.h> - -/* Initialize a block of memory as a command sequence. */ -int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) -{ - memset(seq, 0, sizeof(sctp_cmd_seq_t)); - return 1; /* We always succeed. */ -} - -/* Add a command to a sctp_cmd_seq_t. - * Return 0 if the command sequence is full. - */ -void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj) -{ - BUG_ON(seq->next_free_slot >= SCTP_MAX_NUM_COMMANDS); - - seq->cmds[seq->next_free_slot].verb = verb; - seq->cmds[seq->next_free_slot++].obj = obj; -} - -/* Return the next command structure in a sctp_cmd_seq. - * Returns NULL at the end of the sequence. - */ -sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq) -{ - sctp_cmd_t *retval = NULL; - - if (seq->next_cmd < seq->next_free_slot) - retval = &seq->cmds[seq->next_cmd++]; - - return retval; -} - diff --git a/net/sctp/output.c b/net/sctp/output.c index 01ab8e0723f0..1eedba5195a3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -178,7 +178,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: break; } @@ -633,7 +633,6 @@ nomem: static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { - sctp_xmit_t retval = SCTP_XMIT_OK; size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; @@ -658,15 +657,11 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, datasize = sctp_data_size(chunk); - if (datasize > rwnd) { - if (inflight > 0) { - /* We have (at least) one data chunk in flight, - * so we can't fall back to rule 6.1 B). - */ - retval = SCTP_XMIT_RWND_FULL; - goto finish; - } - } + if (datasize > rwnd && inflight > 0) + /* We have (at least) one data chunk in flight, + * so we can't fall back to rule 6.1 B). + */ + return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * @@ -680,36 +675,44 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. */ - if (chunk->fast_retransmit != SCTP_NEED_FRTX) - if (flight_size >= transport->cwnd) { - retval = SCTP_XMIT_RWND_FULL; - goto finish; - } + if (chunk->fast_retransmit != SCTP_NEED_FRTX && + flight_size >= transport->cwnd) + return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ - if (!sctp_sk(asoc->base.sk)->nodelay && sctp_packet_empty(packet) && - inflight && sctp_state(asoc, ESTABLISHED)) { - unsigned int max = transport->pathmtu - packet->overhead; - unsigned int len = chunk->skb->len + q->out_qlen; - - /* Check whether this chunk and all the rest of pending - * data will fit or delay in hopes of bundling a full - * sized packet. - * Don't delay large message writes that may have been - * fragmeneted into small peices. - */ - if ((len < max) && chunk->msg->can_delay) { - retval = SCTP_XMIT_NAGLE_DELAY; - goto finish; - } - } -finish: - return retval; + if (sctp_sk(asoc->base.sk)->nodelay) + /* Nagle disabled */ + return SCTP_XMIT_OK; + + if (!sctp_packet_empty(packet)) + /* Append to packet */ + return SCTP_XMIT_OK; + + if (inflight == 0) + /* Nothing unacked */ + return SCTP_XMIT_OK; + + if (!sctp_state(asoc, ESTABLISHED)) + return SCTP_XMIT_OK; + + /* Check whether this chunk and all the rest of pending data will fit + * or delay in hopes of bundling a full sized packet. + */ + if (chunk->skb->len + q->out_qlen >= transport->pathmtu - packet->overhead) + /* Enough data queued to fill a packet */ + return SCTP_XMIT_OK; + + /* Don't delay large message writes that may have been fragmented */ + if (!chunk->msg->can_delay) + return SCTP_XMIT_OK; + + /* Defer until all data acked or packet full */ + return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9c77947c0597..7e8f0a117106 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -629,7 +629,7 @@ redo: done = 1; break; - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt); @@ -1015,7 +1015,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) switch (status) { case SCTP_XMIT_PMTU_FULL: case SCTP_XMIT_RWND_FULL: - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: /* We could not append this chunk, so put * the chunk back on the output queue. */ @@ -1025,7 +1025,6 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) sctp_outq_head_data(q, chunk); goto sctp_flush_out; - break; case SCTP_XMIT_OK: /* The sender is in the SHUTDOWN-PENDING state, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5170a1ff95a1..d3f1ea460c50 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4182,7 +4182,6 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, case SCTP_CID_ACTION_DISCARD: /* Discard the packet. */ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - break; case SCTP_CID_ACTION_DISCARD_ERR: /* Generate an ERROR chunk as response. */ hdr = unk_chunk->chunk_hdr; @@ -4198,11 +4197,9 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, /* Discard the packet. */ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); return SCTP_DISPOSITION_CONSUME; - break; case SCTP_CID_ACTION_SKIP: /* Skip the chunk. */ return SCTP_DISPOSITION_DISCARD; - break; case SCTP_CID_ACTION_SKIP_ERR: /* Generate an ERROR chunk as response. */ hdr = unk_chunk->chunk_hdr; @@ -4216,7 +4213,6 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, } /* Skip the chunk. */ return SCTP_DISPOSITION_CONSUME; - break; default: break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 429899689408..743308f40544 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1602,12 +1602,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; sctp_cmsgs_t cmsgs = { NULL }; - int err; sctp_scope_t scope; - long timeo; - __u16 sinfo_flags = 0; + bool fill_sinfo_ttl = false; struct sctp_datamsg *datamsg; int msg_flags = msg->msg_flags; + __u16 sinfo_flags = 0; + long timeo; + int err; err = 0; sp = sctp_sk(sk); @@ -1648,10 +1649,21 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, msg_name = msg->msg_name; } - sinfo = cmsgs.info; sinit = cmsgs.init; + if (cmsgs.sinfo != NULL) { + memset(&default_sinfo, 0, sizeof(default_sinfo)); + default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid; + default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags; + default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid; + default_sinfo.sinfo_context = cmsgs.sinfo->snd_context; + default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id; - /* Did the user specify SNDRCVINFO? */ + sinfo = &default_sinfo; + fill_sinfo_ttl = true; + } else { + sinfo = cmsgs.srinfo; + } + /* Did the user specify SNDINFO/SNDRCVINFO? */ if (sinfo) { sinfo_flags = sinfo->sinfo_flags; associd = sinfo->sinfo_assoc_id; @@ -1858,8 +1870,8 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, pr_debug("%s: we have a valid association\n", __func__); if (!sinfo) { - /* If the user didn't specify SNDRCVINFO, make up one with - * some defaults. + /* If the user didn't specify SNDINFO/SNDRCVINFO, make up + * one with some defaults. */ memset(&default_sinfo, 0, sizeof(default_sinfo)); default_sinfo.sinfo_stream = asoc->default_stream; @@ -1868,7 +1880,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, default_sinfo.sinfo_context = asoc->default_context; default_sinfo.sinfo_timetolive = asoc->default_timetolive; default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); + sinfo = &default_sinfo; + } else if (fill_sinfo_ttl) { + /* In case SNDINFO was specified, we still need to fill + * it with a default ttl from the assoc here. + */ + sinfo->sinfo_timetolive = asoc->default_timetolive; } /* API 7.1.7, the sndbuf size per association bounds the @@ -2042,8 +2060,6 @@ static int sctp_skb_pull(struct sk_buff *skb, int len) * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); - static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) @@ -2094,9 +2110,16 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sp->pf->skb_msgname(skb, msg->msg_name, addr_len); } + /* Check if we allow SCTP_NXTINFO. */ + if (sp->recvnxtinfo) + sctp_ulpevent_read_nxtinfo(event, msg, sk); + /* Check if we allow SCTP_RCVINFO. */ + if (sp->recvrcvinfo) + sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ if (sp->subscribe.sctp_data_io_event) sctp_ulpevent_read_sndrcvinfo(event, msg); + #if 0 /* FIXME: we should be calling IP/IPv6 layers. */ if (sk->sk_protinfo.af_inet.cmsg_flags) @@ -2182,8 +2205,13 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) return -EFAULT; - /* - * At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, + if (sctp_sk(sk)->subscribe.sctp_data_io_event) + pr_warn_ratelimited(DEPRECATED "%s (pid %d) " + "Requested SCTP_SNDRCVINFO event.\n" + "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n", + current->comm, task_pid_nr(current)); + + /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ @@ -2747,19 +2775,22 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, char __user *optval, unsigned int optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (optlen != sizeof(struct sctp_sndrcvinfo)) + if (optlen != sizeof(info)) return -EINVAL; if (copy_from_user(&info, optval, optlen)) return -EFAULT; + if (info.sinfo_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { asoc->default_stream = info.sinfo_stream; asoc->default_flags = info.sinfo_flags; @@ -2777,6 +2808,44 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_setsockopt_default_sndinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (optlen != sizeof(info)) + return -EINVAL; + if (copy_from_user(&info, optval, optlen)) + return -EFAULT; + if (info.snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + asoc->default_stream = info.snd_sid; + asoc->default_flags = info.snd_flags; + asoc->default_ppid = info.snd_ppid; + asoc->default_context = info.snd_context; + } else { + sp->default_stream = info.snd_sid; + sp->default_flags = info.snd_flags; + sp->default_ppid = info.snd_ppid; + sp->default_context = info.snd_context; + } + + return 0; +} + /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as @@ -3523,7 +3592,6 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, return 0; } - /* * SCTP_PEER_ADDR_THLDS * @@ -3574,6 +3642,38 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, return 0; } +static int sctp_setsockopt_recvrcvinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1; + + return 0; +} + +static int sctp_setsockopt_recvnxtinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvnxtinfo = (val == 0) ? 0 : 1; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3671,6 +3771,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_default_send_param(sk, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_setsockopt_primary_addr(sk, optval, optlen); break; @@ -3725,6 +3828,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_PEER_ADDR_THLDS: retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); + break; + case SCTP_RECVNXTINFO: + retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -3971,6 +4080,9 @@ static int sctp_init_sock(struct sock *sk) /* Enable Nagle algorithm by default. */ sp->nodelay = 0; + sp->recvrcvinfo = 0; + sp->recvnxtinfo = 0; + /* Enable by default. */ sp->v4mapped = 1; @@ -4964,14 +5076,14 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (len < sizeof(struct sctp_sndrcvinfo)) + if (len < sizeof(info)) return -EINVAL; - len = sizeof(struct sctp_sndrcvinfo); + len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; @@ -4979,7 +5091,6 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { info.sinfo_stream = asoc->default_stream; info.sinfo_flags = asoc->default_flags; @@ -5002,6 +5113,48 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (len < sizeof(info)) + return -EINVAL; + + len = sizeof(info); + + if (copy_from_user(&info, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + info.snd_sid = asoc->default_stream; + info.snd_flags = asoc->default_flags; + info.snd_ppid = asoc->default_ppid; + info.snd_context = asoc->default_context; + } else { + info.snd_sid = sp->default_stream; + info.snd_flags = sp->default_flags; + info.snd_ppid = sp->default_ppid; + info.snd_context = sp->default_context; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + + return 0; +} + /* * * 7.1.5 SCTP_NODELAY @@ -5752,6 +5905,46 @@ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + if (sctp_sk(sk)->recvrcvinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + if (sctp_sk(sk)->recvnxtinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -5821,6 +6014,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_default_send_param(sk, len, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_getsockopt_default_sndinfo(sk, len, + optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); break; @@ -5895,6 +6092,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); + break; + case SCTP_RECVNXTINFO: + retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6390,8 +6593,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) struct cmsghdr *cmsg; struct msghdr *my_msg = (struct msghdr *)msg; - for (cmsg = CMSG_FIRSTHDR(msg); - cmsg != NULL; + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(my_msg, cmsg)) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; @@ -6404,7 +6606,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) switch (cmsg->cmsg_type) { case SCTP_INIT: /* SCTP Socket API Extension - * 5.2.1 SCTP Initiation Structure (SCTP_INIT) + * 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for * initializing new SCTP associations with sendmsg(). @@ -6416,15 +6618,15 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_initmsg))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg))) return -EINVAL; - cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg); + + cmsgs->init = CMSG_DATA(cmsg); break; case SCTP_SNDRCV: /* SCTP Socket API Extension - * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV) + * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for * sendmsg() and describes SCTP header information @@ -6434,24 +6636,44 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) return -EINVAL; - cmsgs->info = - (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + cmsgs->srinfo = CMSG_DATA(cmsg); - /* Minimally, validate the sinfo_flags. */ - if (cmsgs->info->sinfo_flags & + if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_SNDINFO: + /* SCTP Socket API Extension + * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) + * + * This cmsghdr structure specifies SCTP options for + * sendmsg(). This structure and SCTP_RCVINFO replaces + * SCTP_SNDRCV which has been deprecated. + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo))) + return -EINVAL; + + cmsgs->sinfo = CMSG_DATA(cmsg); + + if (cmsgs->sinfo->snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + break; default: return -EINVAL; } } + return 0; } @@ -6518,8 +6740,8 @@ out: * Note: This is pretty much the same routine as in core/datagram.c * with a few changes to make lksctp work. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, - int noblock, int *err) +struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, + int noblock, int *err) { int error; struct sk_buff *skb; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index dcb19592761e..2e9ada10fd84 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -321,41 +321,40 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - char tmp[8]; struct ctl_table tbl; - int ret; - int changed = 0; + bool changed = false; char *none = "none"; + char tmp[8]; + int ret; memset(&tbl, 0, sizeof(struct ctl_table)); if (write) { tbl.data = tmp; - tbl.maxlen = 8; + tbl.maxlen = sizeof(tmp); } else { tbl.data = net->sctp.sctp_hmac_alg ? : none; tbl.maxlen = strlen(tbl.data); } - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write) { + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { #ifdef CONFIG_CRYPTO_MD5 if (!strncmp(tmp, "md5", 3)) { net->sctp.sctp_hmac_alg = "md5"; - changed = 1; + changed = true; } #endif #ifdef CONFIG_CRYPTO_SHA1 if (!strncmp(tmp, "sha1", 4)) { net->sctp.sctp_hmac_alg = "sha1"; - changed = 1; + changed = true; } #endif if (!strncmp(tmp, "none", 4)) { net->sctp.sctp_hmac_alg = NULL; - changed = 1; + changed = true; } - if (!changed) ret = -EINVAL; } @@ -368,11 +367,10 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - int new_value; - struct ctl_table tbl; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; - int ret; + struct ctl_table tbl; + int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); @@ -381,12 +379,15 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, tbl.data = &new_value; else tbl.data = &net->sctp.rto_min; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - if (write) { - if (ret || new_value > max || new_value < min) + if (write && ret == 0) { + if (new_value > max || new_value < min) return -EINVAL; + net->sctp.rto_min = new_value; } + return ret; } @@ -395,11 +396,10 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; - int new_value; - struct ctl_table tbl; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; - int ret; + struct ctl_table tbl; + int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); @@ -408,12 +408,15 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, tbl.data = &new_value; else tbl.data = &net->sctp.rto_max; + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - if (write) { - if (ret || new_value > max || new_value < min) + if (write && ret == 0) { + if (new_value > max || new_value < min) return -EINVAL; + net->sctp.rto_max = new_value; } + return ret; } @@ -421,8 +424,9 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - pr_warn_once("Changing rto_alpha or rto_beta may lead to " - "suboptimal rtt/srtt estimations!\n"); + if (write) + pr_warn_once("Changing rto_alpha or rto_beta may lead to " + "suboptimal rtt/srtt estimations!\n"); return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } @@ -444,8 +448,7 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, tbl.data = &net->sctp.auth_enable; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); - - if (write) { + if (write && ret == 0) { struct sock *sk = net->sctp.ctl_sock; net->sctp.auth_enable = new_value; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 7dd672fa651f..b10e047bbd15 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -594,15 +594,16 @@ void sctp_transport_burst_reset(struct sctp_transport *t) } /* What is the next timeout value for this transport? */ -unsigned long sctp_transport_timeout(struct sctp_transport *t) +unsigned long sctp_transport_timeout(struct sctp_transport *trans) { - unsigned long timeout; - timeout = t->rto + sctp_jitter(t->rto); - if ((t->state != SCTP_UNCONFIRMED) && - (t->state != SCTP_PF)) - timeout += t->hbinterval; - timeout += jiffies; - return timeout; + /* RTO + timer slack +/- 50% of RTO */ + unsigned long timeout = (trans->rto >> 1) + prandom_u32_max(trans->rto); + + if (trans->state != SCTP_UNCONFIRMED && + trans->state != SCTP_PF) + timeout += trans->hbinterval; + + return timeout + jiffies; } /* Reset transport variables to their initial values */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 85c64658bd0b..e049298ecfa0 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -366,9 +366,10 @@ fail: * specification [SCTP] and any extensions for a list of possible * error formats. */ -struct sctp_ulpevent *sctp_ulpevent_make_remote_error( - const struct sctp_association *asoc, struct sctp_chunk *chunk, - __u16 flags, gfp_t gfp) +struct sctp_ulpevent * +sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, + struct sctp_chunk *chunk, __u16 flags, + gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_remote_error *sre; @@ -387,8 +388,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( /* Copy the skb to a new skb with room for us to prepend * notification with. */ - skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error), - 0, gfp); + skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp); /* Pull off the rest of the cause TLV from the chunk. */ skb_pull(chunk->skb, elen); @@ -399,62 +399,21 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); - sre = (struct sctp_remote_error *) - skb_push(skb, sizeof(struct sctp_remote_error)); + sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre)); /* Trim the buffer to the right length. */ - skb_trim(skb, sizeof(struct sctp_remote_error) + elen); + skb_trim(skb, sizeof(*sre) + elen); - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_type: - * It should be SCTP_REMOTE_ERROR. - */ + /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */ + memset(sre, 0, sizeof(*sre)); sre->sre_type = SCTP_REMOTE_ERROR; - - /* - * Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_flags: 16 bits (unsigned integer) - * Currently unused. - */ sre->sre_flags = 0; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_length: sizeof (__u32) - * - * This field is the total length of the notification data, - * including the notification header. - */ sre->sre_length = skb->len; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_error: 16 bits (unsigned integer) - * This value represents one of the Operational Error causes defined in - * the SCTP specification, in network byte order. - */ sre->sre_error = cause; - - /* Socket Extensions for SCTP - * 5.3.1.3 SCTP_REMOTE_ERROR - * - * sre_assoc_id: sizeof (sctp_assoc_t) - * - * The association id field, holds the identifier for the association. - * All notifications for a given association have the same association - * identifier. For TCP style socket, this field is ignored. - */ sctp_ulpevent_set_owner(event, asoc); sre->sre_assoc_id = sctp_assoc2id(asoc); return event; - fail: return NULL; } @@ -899,7 +858,9 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) return notification->sn_header.sn_type; } -/* Copy out the sndrcvinfo into a msghdr. */ +/* RFC6458, Section 5.3.2. SCTP Header Information Structure + * (SCTP_SNDRCV, DEPRECATED) + */ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { @@ -908,74 +869,84 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, if (sctp_ulpevent_is_notification(event)) return; - /* Sockets API Extensions for SCTP - * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) - * - * sinfo_stream: 16 bits (unsigned integer) - * - * For recvmsg() the SCTP stack places the message's stream number in - * this value. - */ + memset(&sinfo, 0, sizeof(sinfo)); sinfo.sinfo_stream = event->stream; - /* sinfo_ssn: 16 bits (unsigned integer) - * - * For recvmsg() this value contains the stream sequence number that - * the remote endpoint placed in the DATA chunk. For fragmented - * messages this is the same number for all deliveries of the message - * (if more than one recvmsg() is needed to read the message). - */ sinfo.sinfo_ssn = event->ssn; - /* sinfo_ppid: 32 bits (unsigned integer) - * - * In recvmsg() this value is - * the same information that was passed by the upper layer in the peer - * application. Please note that byte order issues are NOT accounted - * for and this information is passed opaquely by the SCTP stack from - * one end to the other. - */ sinfo.sinfo_ppid = event->ppid; - /* sinfo_flags: 16 bits (unsigned integer) - * - * This field may contain any of the following flags and is composed of - * a bitwise OR of these values. - * - * recvmsg() flags: - * - * SCTP_UNORDERED - This flag is present when the message was sent - * non-ordered. - */ sinfo.sinfo_flags = event->flags; - /* sinfo_tsn: 32 bit (unsigned integer) - * - * For the receiving side, this field holds a TSN that was - * assigned to one of the SCTP Data Chunks. - */ sinfo.sinfo_tsn = event->tsn; - /* sinfo_cumtsn: 32 bit (unsigned integer) - * - * This field will hold the current cumulative TSN as - * known by the underlying SCTP layer. Note this field is - * ignored when sending and only valid for a receive - * operation when sinfo_flags are set to SCTP_UNORDERED. - */ sinfo.sinfo_cumtsn = event->cumtsn; - /* sinfo_assoc_id: sizeof (sctp_assoc_t) - * - * The association handle field, sinfo_assoc_id, holds the identifier - * for the association announced in the COMMUNICATION_UP notification. - * All notifications for a given association have the same identifier. - * Ignored for one-to-one style sockets. - */ sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); - - /* context value that is set via SCTP_CONTEXT socket option. */ + /* Context value that is set via SCTP_CONTEXT socket option. */ sinfo.sinfo_context = event->asoc->default_rcv_context; - /* These fields are not used while receiving. */ sinfo.sinfo_timetolive = 0; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, - sizeof(struct sctp_sndrcvinfo), (void *)&sinfo); + sizeof(sinfo), &sinfo); +} + +/* RFC6458, Section 5.3.5 SCTP Receive Information Structure + * (SCTP_SNDRCV) + */ +void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr) +{ + struct sctp_rcvinfo rinfo; + + if (sctp_ulpevent_is_notification(event)) + return; + + memset(&rinfo, 0, sizeof(struct sctp_rcvinfo)); + rinfo.rcv_sid = event->stream; + rinfo.rcv_ssn = event->ssn; + rinfo.rcv_ppid = event->ppid; + rinfo.rcv_flags = event->flags; + rinfo.rcv_tsn = event->tsn; + rinfo.rcv_cumtsn = event->cumtsn; + rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc); + rinfo.rcv_context = event->asoc->default_rcv_context; + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO, + sizeof(rinfo), &rinfo); +} + +/* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure + * (SCTP_NXTINFO) + */ +static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + const struct sk_buff *skb) +{ + struct sctp_nxtinfo nxtinfo; + + memset(&nxtinfo, 0, sizeof(nxtinfo)); + nxtinfo.nxt_sid = event->stream; + nxtinfo.nxt_ppid = event->ppid; + nxtinfo.nxt_flags = event->flags; + if (sctp_ulpevent_is_notification(event)) + nxtinfo.nxt_flags |= SCTP_NOTIFICATION; + nxtinfo.nxt_length = skb->len; + nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc); + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO, + sizeof(nxtinfo), &nxtinfo); +} + +void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + struct sock *sk) +{ + struct sk_buff *skb; + int err; + + skb = sctp_skb_recv_datagram(sk, MSG_PEEK, 1, &err); + if (skb != NULL) { + __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb), + msghdr, skb); + /* Just release refcount here. */ + kfree_skb(skb); + } } /* Do accounting for bytes received and hold a reference to the association diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 247e973544bf..f77366717420 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -592,6 +592,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) put_group_info(acred.group_info); return ret; } +EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 26631679a1fa..dd13bfa09333 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -1,7 +1,7 @@ /* * net/tipc/bcast.c: TIPC broadcast code * - * Copyright (c) 2004-2006, Ericsson AB + * Copyright (c) 2004-2006, 2014, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. @@ -38,6 +38,8 @@ #include "core.h" #include "link.h" #include "port.h" +#include "socket.h" +#include "msg.h" #include "bcast.h" #include "name_distr.h" @@ -138,6 +140,11 @@ static void tipc_bclink_unlock(void) tipc_link_reset_all(node); } +uint tipc_bclink_get_mtu(void) +{ + return MAX_PKT_DEFAULT_MCAST; +} + void tipc_bclink_set_flags(unsigned int flags) { bclink->flags |= flags; @@ -382,30 +389,50 @@ static void bclink_peek_nack(struct tipc_msg *msg) tipc_node_unlock(n_ptr); } -/* - * tipc_bclink_xmit - broadcast a packet to all nodes in cluster +/* tipc_bclink_xmit - broadcast buffer chain to all nodes in cluster + * and to identified node local sockets + * @buf: chain of buffers containing message + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ int tipc_bclink_xmit(struct sk_buff *buf) { - int res; + int rc = 0; + int bc = 0; + struct sk_buff *clbuf; - tipc_bclink_lock(); - - if (!bclink->bcast_nodes.count) { - res = msg_data_sz(buf_msg(buf)); - kfree_skb(buf); - goto exit; + /* Prepare clone of message for local node */ + clbuf = tipc_msg_reassemble(buf); + if (unlikely(!clbuf)) { + kfree_skb_list(buf); + return -EHOSTUNREACH; } - res = __tipc_link_xmit(bcl, buf); - if (likely(res >= 0)) { - bclink_set_last_sent(); - bcl->stats.queue_sz_counts++; - bcl->stats.accu_queue_sz += bcl->out_queue_size; + /* Broadcast to all other nodes */ + if (likely(bclink)) { + tipc_bclink_lock(); + if (likely(bclink->bcast_nodes.count)) { + rc = __tipc_link_xmit(bcl, buf); + if (likely(!rc)) { + bclink_set_last_sent(); + bcl->stats.queue_sz_counts++; + bcl->stats.accu_queue_sz += bcl->out_queue_size; + } + bc = 1; + } + tipc_bclink_unlock(); } -exit: - tipc_bclink_unlock(); - return res; + + if (unlikely(!bc)) + kfree_skb_list(buf); + + /* Deliver message clone */ + if (likely(!rc)) + tipc_sk_mcast_rcv(clbuf); + else + kfree_skb(clbuf); + + return rc; } /** @@ -443,7 +470,7 @@ void tipc_bclink_rcv(struct sk_buff *buf) struct tipc_node *node; u32 next_in; u32 seqno; - int deferred; + int deferred = 0; /* Screen out unwanted broadcast messages */ @@ -494,7 +521,7 @@ receive: tipc_bclink_unlock(); tipc_node_unlock(node); if (likely(msg_mcast(msg))) - tipc_port_mcast_rcv(buf, NULL); + tipc_sk_mcast_rcv(buf); else kfree_skb(buf); } else if (msg_user(msg) == MSG_BUNDLER) { @@ -559,6 +586,7 @@ receive: buf = node->bclink.deferred_head; node->bclink.deferred_head = buf->next; + buf->next = NULL; node->bclink.deferred_size--; goto receive; } @@ -572,8 +600,7 @@ receive: node->bclink.deferred_size += deferred; bclink_update_last_sent(node, seqno); buf = NULL; - } else - deferred = 0; + } tipc_bclink_lock(); @@ -610,6 +637,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, struct tipc_media_addr *unused2) { int bp_index; + struct tipc_msg *msg = buf_msg(buf); /* Prepare broadcast link message for reliable transmission, * if first time trying to send it; @@ -617,10 +645,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, * since they are sent in an unreliable manner and don't need it */ if (likely(!msg_non_seq(buf_msg(buf)))) { - struct tipc_msg *msg; - bcbuf_set_acks(buf, bclink->bcast_nodes.count); - msg = buf_msg(buf); msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tipc_net_id); bcl->stats.sent_info++; @@ -637,12 +662,14 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary; struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary; - struct tipc_bearer *b = p; + struct tipc_bearer *bp[2] = {p, s}; + struct tipc_bearer *b = bp[msg_link_selector(msg)]; struct sk_buff *tbuf; if (!p) break; /* No more bearers to try */ - + if (!b) + b = p; tipc_nmap_diff(&bcbearer->remains, &b->nodes, &bcbearer->remains_new); if (bcbearer->remains_new.count == bcbearer->remains.count) @@ -659,13 +686,6 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, tipc_bearer_send(b->identity, tbuf, &b->bcast_addr); kfree_skb(tbuf); /* Bearer keeps a clone */ } - - /* Swap bearers for next packet */ - if (s) { - bcbearer->bpairs[bp_index].primary = s; - bcbearer->bpairs[bp_index].secondary = p; - } - if (bcbearer->remains_new.count == 0) break; /* All targets reached */ diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 00330c45df3e..4875d9536aee 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -1,7 +1,7 @@ /* * net/tipc/bcast.h: Include file for TIPC broadcast code * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -89,7 +89,6 @@ void tipc_bclink_add_node(u32 addr); void tipc_bclink_remove_node(u32 addr); struct tipc_node *tipc_bclink_retransmit_to(void); void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked); -int tipc_bclink_xmit(struct sk_buff *buf); void tipc_bclink_rcv(struct sk_buff *buf); u32 tipc_bclink_get_last_sent(void); u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr); @@ -98,5 +97,7 @@ int tipc_bclink_stats(char *stats_buf, const u32 buf_size); int tipc_bclink_reset_stats(void); int tipc_bclink_set_queue_limits(u32 limit); void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action); +uint tipc_bclink_get_mtu(void); +int tipc_bclink_xmit(struct sk_buff *buf); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index ad2c57f5868d..fb1485dc6736 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -82,15 +82,13 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf); static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, struct sk_buff **buf); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance); -static int tipc_link_iovec_long_xmit(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff *buf); static void tipc_link_sync_xmit(struct tipc_link *l); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); +static int tipc_link_input(struct tipc_link *l, struct sk_buff *buf); +static int tipc_link_prepare_input(struct tipc_link *l, struct sk_buff **buf); /* * Simple link routines @@ -335,13 +333,15 @@ void tipc_link_delete_list(unsigned int bearer_id, bool shutting_down) static int link_schedule_port(struct tipc_link *l_ptr, u32 origport, u32 sz) { struct tipc_port *p_ptr; + struct tipc_sock *tsk; spin_lock_bh(&tipc_port_list_lock); p_ptr = tipc_port_lock(origport); if (p_ptr) { if (!list_empty(&p_ptr->wait_list)) goto exit; - p_ptr->congested = 1; + tsk = tipc_port_to_sock(p_ptr); + tsk->link_cong = 1; p_ptr->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt); list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports); l_ptr->stats.link_congs++; @@ -355,6 +355,7 @@ exit: void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all) { struct tipc_port *p_ptr; + struct tipc_sock *tsk; struct tipc_port *temp_p_ptr; int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size; @@ -370,10 +371,11 @@ void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all) wait_list) { if (win <= 0) break; + tsk = tipc_port_to_sock(p_ptr); list_del_init(&p_ptr->wait_list); spin_lock_bh(p_ptr->lock); - p_ptr->congested = 0; - tipc_port_wakeup(p_ptr); + tsk->link_cong = 0; + tipc_sock_wakeup(tsk); win -= p_ptr->waiting_pkts; spin_unlock_bh(p_ptr->lock); } @@ -676,178 +678,142 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } } -/* - * link_bundle_buf(): Append contents of a buffer to - * the tail of an existing one. +/* tipc_link_cong: determine return value and how to treat the + * sent buffer during link congestion. + * - For plain, errorless user data messages we keep the buffer and + * return -ELINKONG. + * - For all other messages we discard the buffer and return -EHOSTUNREACH + * - For TIPC internal messages we also reset the link */ -static int link_bundle_buf(struct tipc_link *l_ptr, struct sk_buff *bundler, - struct sk_buff *buf) +static int tipc_link_cong(struct tipc_link *link, struct sk_buff *buf) { - struct tipc_msg *bundler_msg = buf_msg(bundler); struct tipc_msg *msg = buf_msg(buf); - u32 size = msg_size(msg); - u32 bundle_size = msg_size(bundler_msg); - u32 to_pos = align(bundle_size); - u32 pad = to_pos - bundle_size; - - if (msg_user(bundler_msg) != MSG_BUNDLER) - return 0; - if (msg_type(bundler_msg) != OPEN_MSG) - return 0; - if (skb_tailroom(bundler) < (pad + size)) - return 0; - if (l_ptr->max_pkt < (to_pos + size)) - return 0; - - skb_put(bundler, pad + size); - skb_copy_to_linear_data_offset(bundler, to_pos, buf->data, size); - msg_set_size(bundler_msg, to_pos + size); - msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1); - kfree_skb(buf); - l_ptr->stats.sent_bundled++; - return 1; -} - -static void link_add_to_outqueue(struct tipc_link *l_ptr, - struct sk_buff *buf, - struct tipc_msg *msg) -{ - u32 ack = mod(l_ptr->next_in_no - 1); - u32 seqno = mod(l_ptr->next_out_no++); + uint psz = msg_size(msg); + uint imp = tipc_msg_tot_importance(msg); + u32 oport = msg_tot_origport(msg); - msg_set_word(msg, 2, ((ack << 16) | seqno)); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - buf->next = NULL; - if (l_ptr->first_out) { - l_ptr->last_out->next = buf; - l_ptr->last_out = buf; - } else - l_ptr->first_out = l_ptr->last_out = buf; - - l_ptr->out_queue_size++; - if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz) - l_ptr->stats.max_queue_sz = l_ptr->out_queue_size; -} - -static void link_add_chain_to_outqueue(struct tipc_link *l_ptr, - struct sk_buff *buf_chain, - u32 long_msgno) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - - if (!l_ptr->next_out) - l_ptr->next_out = buf_chain; - while (buf_chain) { - buf = buf_chain; - buf_chain = buf_chain->next; - - msg = buf_msg(buf); - msg_set_long_msgno(msg, long_msgno); - link_add_to_outqueue(l_ptr, buf, msg); + if (likely(imp <= TIPC_CRITICAL_IMPORTANCE)) { + if (!msg_errcode(msg) && !msg_reroute_cnt(msg)) { + link_schedule_port(link, oport, psz); + return -ELINKCONG; + } + } else { + pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); + tipc_link_reset(link); } + kfree_skb_list(buf); + return -EHOSTUNREACH; } -/* - * tipc_link_xmit() is the 'full path' for messages, called from - * inside TIPC when the 'fast path' in tipc_send_xmit - * has failed, and from link_send() +/** + * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked + * @link: link to use + * @buf: chain of buffers containing message + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG, -EMSGSIZE (plain socket + * user data messages) or -EHOSTUNREACH (all other messages/senders) + * Only the socket functions tipc_send_stream() and tipc_send_packet() need + * to act on the return value, since they may need to do more send attempts. */ -int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) +int __tipc_link_xmit(struct tipc_link *link, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); - u32 size = msg_size(msg); - u32 dsz = msg_data_sz(msg); - u32 queue_size = l_ptr->out_queue_size; - u32 imp = tipc_msg_tot_importance(msg); - u32 queue_limit = l_ptr->queue_limit[imp]; - u32 max_packet = l_ptr->max_pkt; - - /* Match msg importance against queue limits: */ - if (unlikely(queue_size >= queue_limit)) { - if (imp <= TIPC_CRITICAL_IMPORTANCE) { - link_schedule_port(l_ptr, msg_origport(msg), size); - kfree_skb(buf); - return -ELINKCONG; - } - kfree_skb(buf); - if (imp > CONN_MANAGER) { - pr_warn("%s<%s>, send queue full", link_rst_msg, - l_ptr->name); - tipc_link_reset(l_ptr); - } - return dsz; + uint psz = msg_size(msg); + uint qsz = link->out_queue_size; + uint sndlim = link->queue_limit[0]; + uint imp = tipc_msg_tot_importance(msg); + uint mtu = link->max_pkt; + uint ack = mod(link->next_in_no - 1); + uint seqno = link->next_out_no; + uint bc_last_in = link->owner->bclink.last_in; + struct tipc_media_addr *addr = &link->media_addr; + struct sk_buff *next = buf->next; + + /* Match queue limits against msg importance: */ + if (unlikely(qsz >= link->queue_limit[imp])) + return tipc_link_cong(link, buf); + + /* Has valid packet limit been used ? */ + if (unlikely(psz > mtu)) { + kfree_skb_list(buf); + return -EMSGSIZE; } - /* Fragmentation needed ? */ - if (size > max_packet) - return tipc_link_frag_xmit(l_ptr, buf); - - /* Packet can be queued or sent. */ - if (likely(!link_congested(l_ptr))) { - link_add_to_outqueue(l_ptr, buf, msg); + /* Prepare each packet for sending, and add to outqueue: */ + while (buf) { + next = buf->next; + msg = buf_msg(buf); + msg_set_word(msg, 2, ((ack << 16) | mod(seqno))); + msg_set_bcast_ack(msg, bc_last_in); + + if (!link->first_out) { + link->first_out = buf; + } else if (qsz < sndlim) { + link->last_out->next = buf; + } else if (tipc_msg_bundle(link->last_out, buf, mtu)) { + link->stats.sent_bundled++; + buf = next; + next = buf->next; + continue; + } else if (tipc_msg_make_bundle(&buf, mtu, link->addr)) { + link->stats.sent_bundled++; + link->stats.sent_bundles++; + link->last_out->next = buf; + if (!link->next_out) + link->next_out = buf; + } else { + link->last_out->next = buf; + if (!link->next_out) + link->next_out = buf; + } - tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); - l_ptr->unacked_window = 0; - return dsz; - } - /* Congestion: can message be bundled ? */ - if ((msg_user(msg) != CHANGEOVER_PROTOCOL) && - (msg_user(msg) != MSG_FRAGMENTER)) { - - /* Try adding message to an existing bundle */ - if (l_ptr->next_out && - link_bundle_buf(l_ptr, l_ptr->last_out, buf)) - return dsz; - - /* Try creating a new bundle */ - if (size <= max_packet * 2 / 3) { - struct sk_buff *bundler = tipc_buf_acquire(max_packet); - struct tipc_msg bundler_hdr; - - if (bundler) { - tipc_msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG, - INT_H_SIZE, l_ptr->addr); - skb_copy_to_linear_data(bundler, &bundler_hdr, - INT_H_SIZE); - skb_trim(bundler, INT_H_SIZE); - link_bundle_buf(l_ptr, bundler, buf); - buf = bundler; - msg = buf_msg(buf); - l_ptr->stats.sent_bundles++; - } + /* Send packet if possible: */ + if (likely(++qsz <= sndlim)) { + tipc_bearer_send(link->bearer_id, buf, addr); + link->next_out = next; + link->unacked_window = 0; } + seqno++; + link->last_out = buf; + buf = next; } - if (!l_ptr->next_out) - l_ptr->next_out = buf; - link_add_to_outqueue(l_ptr, buf, msg); - return dsz; + link->next_out_no = seqno; + link->out_queue_size = qsz; + return 0; } -/* - * tipc_link_xmit(): same as __tipc_link_xmit(), but the link to use - * has not been selected yet, and the the owner node is not locked - * Called by TIPC internal users, e.g. the name distributor +/** + * tipc_link_xmit() is the general link level function for message sending + * @buf: chain of buffers containing message + * @dsz: amount of user data to be sent + * @dnode: address of destination node + * @selector: a number used for deterministic link selection + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ -int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector) +int tipc_link_xmit(struct sk_buff *buf, u32 dnode, u32 selector) { - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - int res = -ELINKCONG; + struct tipc_link *link = NULL; + struct tipc_node *node; + int rc = -EHOSTUNREACH; - n_ptr = tipc_node_find(dest); - if (n_ptr) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[selector & 1]; - if (l_ptr) - res = __tipc_link_xmit(l_ptr, buf); - else - kfree_skb(buf); - tipc_node_unlock(n_ptr); - } else { - kfree_skb(buf); + node = tipc_node_find(dnode); + if (node) { + tipc_node_lock(node); + link = node->active_links[selector & 1]; + if (link) + rc = __tipc_link_xmit(link, buf); + tipc_node_unlock(node); } - return res; + + if (link) + return rc; + + if (likely(in_own_node(dnode))) + return tipc_sk_rcv(buf); + + kfree_skb_list(buf); + return rc; } /* @@ -858,7 +824,7 @@ int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector) * * Called with node locked */ -static void tipc_link_sync_xmit(struct tipc_link *l) +static void tipc_link_sync_xmit(struct tipc_link *link) { struct sk_buff *buf; struct tipc_msg *msg; @@ -868,10 +834,9 @@ static void tipc_link_sync_xmit(struct tipc_link *l) return; msg = buf_msg(buf); - tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, l->addr); - msg_set_last_bcast(msg, l->owner->bclink.acked); - link_add_chain_to_outqueue(l, buf, 0); - tipc_link_push_queue(l); + tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, link->addr); + msg_set_last_bcast(msg, link->owner->bclink.acked); + __tipc_link_xmit(link, buf); } /* @@ -892,293 +857,6 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) } /* - * tipc_link_names_xmit - send name table entries to new neighbor - * - * Send routine for bulk delivery of name table messages when contact - * with a new neighbor occurs. No link congestion checking is performed - * because name table messages *must* be delivered. The messages must be - * small enough not to require fragmentation. - * Called without any locks held. - */ -void tipc_link_names_xmit(struct list_head *message_list, u32 dest) -{ - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct sk_buff *buf; - struct sk_buff *temp_buf; - - if (list_empty(message_list)) - return; - - n_ptr = tipc_node_find(dest); - if (n_ptr) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[0]; - if (l_ptr) { - /* convert circular list to linear list */ - ((struct sk_buff *)message_list->prev)->next = NULL; - link_add_chain_to_outqueue(l_ptr, - (struct sk_buff *)message_list->next, 0); - tipc_link_push_queue(l_ptr); - INIT_LIST_HEAD(message_list); - } - tipc_node_unlock(n_ptr); - } - - /* discard the messages if they couldn't be sent */ - list_for_each_safe(buf, temp_buf, ((struct sk_buff *)message_list)) { - list_del((struct list_head *)buf); - kfree_skb(buf); - } -} - -/* - * tipc_link_xmit_fast: Entry for data messages where the - * destination link is known and the header is complete, - * inclusive total message length. Very time critical. - * Link is locked. Returns user data length. - */ -static int tipc_link_xmit_fast(struct tipc_link *l_ptr, struct sk_buff *buf, - u32 *used_max_pkt) -{ - struct tipc_msg *msg = buf_msg(buf); - int res = msg_data_sz(msg); - - if (likely(!link_congested(l_ptr))) { - if (likely(msg_size(msg) <= l_ptr->max_pkt)) { - link_add_to_outqueue(l_ptr, buf, msg); - tipc_bearer_send(l_ptr->bearer_id, buf, - &l_ptr->media_addr); - l_ptr->unacked_window = 0; - return res; - } - else - *used_max_pkt = l_ptr->max_pkt; - } - return __tipc_link_xmit(l_ptr, buf); /* All other cases */ -} - -/* - * tipc_link_iovec_xmit_fast: Entry for messages where the - * destination processor is known and the header is complete, - * except for total message length. - * Returns user data length or errno. - */ -int tipc_link_iovec_xmit_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) -{ - struct tipc_msg *hdr = &sender->phdr; - struct tipc_link *l_ptr; - struct sk_buff *buf; - struct tipc_node *node; - int res; - u32 selector = msg_origport(hdr) & 1; - -again: - /* - * Try building message using port's max_pkt hint. - * (Must not hold any locks while building message.) - */ - res = tipc_msg_build(hdr, msg_sect, len, sender->max_pkt, &buf); - /* Exit if build request was invalid */ - if (unlikely(res < 0)) - return res; - - node = tipc_node_find(destaddr); - if (likely(node)) { - tipc_node_lock(node); - l_ptr = node->active_links[selector]; - if (likely(l_ptr)) { - if (likely(buf)) { - res = tipc_link_xmit_fast(l_ptr, buf, - &sender->max_pkt); -exit: - tipc_node_unlock(node); - return res; - } - - /* Exit if link (or bearer) is congested */ - if (link_congested(l_ptr)) { - res = link_schedule_port(l_ptr, - sender->ref, res); - goto exit; - } - - /* - * Message size exceeds max_pkt hint; update hint, - * then re-try fast path or fragment the message - */ - sender->max_pkt = l_ptr->max_pkt; - tipc_node_unlock(node); - - - if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt) - goto again; - - return tipc_link_iovec_long_xmit(sender, msg_sect, - len, destaddr); - } - tipc_node_unlock(node); - } - - /* Couldn't find a link to the destination node */ - kfree_skb(buf); - tipc_port_iovec_reject(sender, hdr, msg_sect, len, TIPC_ERR_NO_NODE); - return -ENETUNREACH; -} - -/* - * tipc_link_iovec_long_xmit(): Entry for long messages where the - * destination node is known and the header is complete, - * inclusive total message length. - * Link and bearer congestion status have been checked to be ok, - * and are ignored if they change. - * - * Note that fragments do not use the full link MTU so that they won't have - * to undergo refragmentation if link changeover causes them to be sent - * over another link with an additional tunnel header added as prefix. - * (Refragmentation will still occur if the other link has a smaller MTU.) - * - * Returns user data length or errno. - */ -static int tipc_link_iovec_long_xmit(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) -{ - struct tipc_link *l_ptr; - struct tipc_node *node; - struct tipc_msg *hdr = &sender->phdr; - u32 dsz = len; - u32 max_pkt, fragm_sz, rest; - struct tipc_msg fragm_hdr; - struct sk_buff *buf, *buf_chain, *prev; - u32 fragm_crs, fragm_rest, hsz, sect_rest; - const unchar __user *sect_crs; - int curr_sect; - u32 fragm_no; - int res = 0; - -again: - fragm_no = 1; - max_pkt = sender->max_pkt - INT_H_SIZE; - /* leave room for tunnel header in case of link changeover */ - fragm_sz = max_pkt - INT_H_SIZE; - /* leave room for fragmentation header in each fragment */ - rest = dsz; - fragm_crs = 0; - fragm_rest = 0; - sect_rest = 0; - sect_crs = NULL; - curr_sect = -1; - - /* Prepare reusable fragment header */ - tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, - INT_H_SIZE, msg_destnode(hdr)); - msg_set_size(&fragm_hdr, max_pkt); - msg_set_fragm_no(&fragm_hdr, 1); - - /* Prepare header of first fragment */ - buf_chain = buf = tipc_buf_acquire(max_pkt); - if (!buf) - return -ENOMEM; - buf->next = NULL; - skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); - hsz = msg_hdr_sz(hdr); - skb_copy_to_linear_data_offset(buf, INT_H_SIZE, hdr, hsz); - - /* Chop up message */ - fragm_crs = INT_H_SIZE + hsz; - fragm_rest = fragm_sz - hsz; - - do { /* For all sections */ - u32 sz; - - if (!sect_rest) { - sect_rest = msg_sect[++curr_sect].iov_len; - sect_crs = msg_sect[curr_sect].iov_base; - } - - if (sect_rest < fragm_rest) - sz = sect_rest; - else - sz = fragm_rest; - - if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) { - res = -EFAULT; -error: - kfree_skb_list(buf_chain); - return res; - } - sect_crs += sz; - sect_rest -= sz; - fragm_crs += sz; - fragm_rest -= sz; - rest -= sz; - - if (!fragm_rest && rest) { - - /* Initiate new fragment: */ - if (rest <= fragm_sz) { - fragm_sz = rest; - msg_set_type(&fragm_hdr, LAST_FRAGMENT); - } else { - msg_set_type(&fragm_hdr, FRAGMENT); - } - msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); - msg_set_fragm_no(&fragm_hdr, ++fragm_no); - prev = buf; - buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE); - if (!buf) { - res = -ENOMEM; - goto error; - } - - buf->next = NULL; - prev->next = buf; - skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); - fragm_crs = INT_H_SIZE; - fragm_rest = fragm_sz; - } - } while (rest > 0); - - /* - * Now we have a buffer chain. Select a link and check - * that packet size is still OK - */ - node = tipc_node_find(destaddr); - if (likely(node)) { - tipc_node_lock(node); - l_ptr = node->active_links[sender->ref & 1]; - if (!l_ptr) { - tipc_node_unlock(node); - goto reject; - } - if (l_ptr->max_pkt < max_pkt) { - sender->max_pkt = l_ptr->max_pkt; - tipc_node_unlock(node); - kfree_skb_list(buf_chain); - goto again; - } - } else { -reject: - kfree_skb_list(buf_chain); - tipc_port_iovec_reject(sender, hdr, msg_sect, len, - TIPC_ERR_NO_NODE); - return -ENETUNREACH; - } - - /* Append chain of fragments to send queue & send them */ - l_ptr->long_msg_seq_no++; - link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); - l_ptr->stats.sent_fragments += fragm_no; - l_ptr->stats.sent_fragmented++; - tipc_link_push_queue(l_ptr); - tipc_node_unlock(node); - return dsz; -} - -/* * tipc_link_push_packet: Push one unsent packet to the media */ static u32 tipc_link_push_packet(struct tipc_link *l_ptr) @@ -1238,7 +916,7 @@ static u32 tipc_link_push_packet(struct tipc_link *l_ptr) tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); if (msg_user(msg) == MSG_BUNDLER) - msg_set_type(msg, CLOSED_MSG); + msg_set_type(msg, BUNDLE_CLOSED); l_ptr->next_out = buf->next; return 0; } @@ -1527,11 +1205,6 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(!list_empty(&l_ptr->waiting_ports))) tipc_link_wakeup_ports(l_ptr, 0); - if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { - l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); - } - /* Process the incoming packet */ if (unlikely(!link_working_working(l_ptr))) { if (msg_user(msg) == LINK_PROTOCOL) { @@ -1565,57 +1238,19 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(l_ptr->oldest_deferred_in)) head = link_insert_deferred_queue(l_ptr, head); - /* Deliver packet/message to correct user: */ - if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) { - if (!tipc_link_tunnel_rcv(n_ptr, &buf)) { - tipc_node_unlock(n_ptr); - continue; - } - msg = buf_msg(buf); - } else if (msg_user(msg) == MSG_FRAGMENTER) { - l_ptr->stats.recv_fragments++; - if (tipc_buf_append(&l_ptr->reasm_buf, &buf)) { - l_ptr->stats.recv_fragmented++; - msg = buf_msg(buf); - } else { - if (!l_ptr->reasm_buf) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); - continue; - } + if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + l_ptr->stats.sent_acks++; + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } - switch (msg_user(msg)) { - case TIPC_LOW_IMPORTANCE: - case TIPC_MEDIUM_IMPORTANCE: - case TIPC_HIGH_IMPORTANCE: - case TIPC_CRITICAL_IMPORTANCE: + if (tipc_link_prepare_input(l_ptr, &buf)) { tipc_node_unlock(n_ptr); - tipc_sk_rcv(buf); continue; - case MSG_BUNDLER: - l_ptr->stats.recv_bundles++; - l_ptr->stats.recv_bundled += msg_msgcnt(msg); - tipc_node_unlock(n_ptr); - tipc_link_bundle_rcv(buf); - continue; - case NAME_DISTRIBUTOR: - n_ptr->bclink.recv_permitted = true; - tipc_node_unlock(n_ptr); - tipc_named_rcv(buf); - continue; - case CONN_MANAGER: - tipc_node_unlock(n_ptr); - tipc_port_proto_rcv(buf); - continue; - case BCAST_PROTOCOL: - tipc_link_sync_rcv(n_ptr, buf); - break; - default: - kfree_skb(buf); - break; } tipc_node_unlock(n_ptr); + msg = buf_msg(buf); + if (tipc_link_input(l_ptr, buf) != 0) + goto discard; continue; unlock_discard: tipc_node_unlock(n_ptr); @@ -1625,6 +1260,80 @@ discard: } /** + * tipc_link_prepare_input - process TIPC link messages + * + * returns nonzero if the message was consumed + * + * Node lock must be held + */ +static int tipc_link_prepare_input(struct tipc_link *l, struct sk_buff **buf) +{ + struct tipc_node *n; + struct tipc_msg *msg; + int res = -EINVAL; + + n = l->owner; + msg = buf_msg(*buf); + switch (msg_user(msg)) { + case CHANGEOVER_PROTOCOL: + if (tipc_link_tunnel_rcv(n, buf)) + res = 0; + break; + case MSG_FRAGMENTER: + l->stats.recv_fragments++; + if (tipc_buf_append(&l->reasm_buf, buf)) { + l->stats.recv_fragmented++; + res = 0; + } else if (!l->reasm_buf) { + tipc_link_reset(l); + } + break; + case MSG_BUNDLER: + l->stats.recv_bundles++; + l->stats.recv_bundled += msg_msgcnt(msg); + res = 0; + break; + case NAME_DISTRIBUTOR: + n->bclink.recv_permitted = true; + res = 0; + break; + case BCAST_PROTOCOL: + tipc_link_sync_rcv(n, *buf); + break; + default: + res = 0; + } + return res; +} +/** + * tipc_link_input - Deliver message too higher layers + */ +static int tipc_link_input(struct tipc_link *l, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + int res = 0; + + switch (msg_user(msg)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + case CONN_MANAGER: + tipc_sk_rcv(buf); + break; + case NAME_DISTRIBUTOR: + tipc_named_rcv(buf); + break; + case MSG_BUNDLER: + tipc_link_bundle_rcv(buf); + break; + default: + res = -EINVAL; + } + return res; +} + +/** * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue * * Returns increase in queue length (i.e. 0 or 1) @@ -2217,6 +1926,7 @@ void tipc_link_bundle_rcv(struct sk_buff *buf) u32 msgcount = msg_msgcnt(buf_msg(buf)); u32 pos = INT_H_SIZE; struct sk_buff *obuf; + struct tipc_msg *omsg; while (msgcount--) { obuf = buf_extract(buf, pos); @@ -2224,82 +1934,18 @@ void tipc_link_bundle_rcv(struct sk_buff *buf) pr_warn("Link unable to unbundle message(s)\n"); break; } - pos += align(msg_size(buf_msg(obuf))); - tipc_net_route_msg(obuf); - } - kfree_skb(buf); -} - -/* - * Fragmentation/defragmentation: - */ - -/* - * tipc_link_frag_xmit: Entry for buffers needing fragmentation. - * The buffer is complete, inclusive total message length. - * Returns user data length. - */ -static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) -{ - struct sk_buff *buf_chain = NULL; - struct sk_buff *buf_chain_tail = (struct sk_buff *)&buf_chain; - struct tipc_msg *inmsg = buf_msg(buf); - struct tipc_msg fragm_hdr; - u32 insize = msg_size(inmsg); - u32 dsz = msg_data_sz(inmsg); - unchar *crs = buf->data; - u32 rest = insize; - u32 pack_sz = l_ptr->max_pkt; - u32 fragm_sz = pack_sz - INT_H_SIZE; - u32 fragm_no = 0; - u32 destaddr; - - if (msg_short(inmsg)) - destaddr = l_ptr->addr; - else - destaddr = msg_destnode(inmsg); - - /* Prepare reusable fragment header: */ - tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, - INT_H_SIZE, destaddr); - - /* Chop up message: */ - while (rest > 0) { - struct sk_buff *fragm; - - if (rest <= fragm_sz) { - fragm_sz = rest; - msg_set_type(&fragm_hdr, LAST_FRAGMENT); - } - fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE); - if (fragm == NULL) { - kfree_skb(buf); - kfree_skb_list(buf_chain); - return -ENOMEM; + omsg = buf_msg(obuf); + pos += align(msg_size(omsg)); + if (msg_isdata(omsg) || (msg_user(omsg) == CONN_MANAGER)) { + tipc_sk_rcv(obuf); + } else if (msg_user(omsg) == NAME_DISTRIBUTOR) { + tipc_named_rcv(obuf); + } else { + pr_warn("Illegal bundled msg: %u\n", msg_user(omsg)); + kfree_skb(obuf); } - msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); - fragm_no++; - msg_set_fragm_no(&fragm_hdr, fragm_no); - skb_copy_to_linear_data(fragm, &fragm_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(fragm, INT_H_SIZE, crs, - fragm_sz); - buf_chain_tail->next = fragm; - buf_chain_tail = fragm; - - rest -= fragm_sz; - crs += fragm_sz; - msg_set_type(&fragm_hdr, FRAGMENT); } kfree_skb(buf); - - /* Append chain of fragments to send queue & send them */ - l_ptr->long_msg_seq_no++; - link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); - l_ptr->stats.sent_fragments += fragm_no; - l_ptr->stats.sent_fragmented++; - tipc_link_push_queue(l_ptr); - - return dsz; } static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance) diff --git a/net/tipc/link.h b/net/tipc/link.h index 200d518b218e..782983ccd323 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -227,13 +227,8 @@ void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); void tipc_link_reset_list(unsigned int bearer_id); int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector); -void tipc_link_names_xmit(struct list_head *message_list, u32 dest); -int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf); -int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf); +int __tipc_link_xmit(struct tipc_link *link, struct sk_buff *buf); u32 tipc_link_get_max_pkt(u32 dest, u32 selector); -int tipc_link_iovec_xmit_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); void tipc_link_bundle_rcv(struct sk_buff *buf); void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, u32 gap, u32 tolerance, u32 priority, u32 acked_mtu); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 8be6e94a1ca9..b6f45d029933 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -36,21 +36,16 @@ #include "core.h" #include "msg.h" +#include "addr.h" +#include "name_table.h" -u32 tipc_msg_tot_importance(struct tipc_msg *m) +#define MAX_FORWARD_SIZE 1024 + +static unsigned int align(unsigned int i) { - if (likely(msg_isdata(m))) { - if (likely(msg_orignode(m) == tipc_own_addr)) - return msg_importance(m); - return msg_importance(m) + 4; - } - if ((msg_user(m) == MSG_FRAGMENTER) && - (msg_type(m) == FIRST_FRAGMENT)) - return msg_importance(msg_get_wrapped(m)); - return msg_importance(m); + return (i + 3) & ~3u; } - void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode) { @@ -65,45 +60,12 @@ void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, msg_set_destnode(m, destnode); } -/** - * tipc_msg_build - create message using specified header and data - * - * Note: Caller must not hold any locks in case copy_from_user() is interrupted! - * - * Returns message data size or errno - */ -int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, - unsigned int len, int max_size, struct sk_buff **buf) -{ - int dsz, sz, hsz; - unsigned char *to; - - dsz = len; - hsz = msg_hdr_sz(hdr); - sz = hsz + dsz; - msg_set_size(hdr, sz); - if (unlikely(sz > max_size)) { - *buf = NULL; - return dsz; - } - - *buf = tipc_buf_acquire(sz); - if (!(*buf)) - return -ENOMEM; - skb_copy_to_linear_data(*buf, hdr, hsz); - to = (*buf)->data + hsz; - if (len && memcpy_fromiovecend(to, msg_sect, 0, dsz)) { - kfree_skb(*buf); - *buf = NULL; - return -EFAULT; - } - return dsz; -} - /* tipc_buf_append(): Append a buffer to the fragment list of another buffer - * Let first buffer become head buffer - * Returns 1 and sets *buf to headbuf if chain is complete, otherwise 0 - * Leaves headbuf pointer at NULL if failure + * @*headbuf: in: NULL for first frag, otherwise value returned from prev call + * out: set when successful non-complete reassembly, otherwise NULL + * @*buf: in: the buffer to append. Always defined + * out: head buf after sucessful complete reassembly, otherwise NULL + * Returns 1 when reassembly complete, otherwise 0 */ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { @@ -122,6 +84,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) goto out_free; head = *headbuf = frag; skb_frag_list_init(head); + *buf = NULL; return 0; } if (!head) @@ -150,5 +113,307 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) out_free: pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); + kfree_skb(*headbuf); + *buf = *headbuf = NULL; return 0; } + + +/** + * tipc_msg_build - create buffer chain containing specified header and data + * @mhdr: Message header, to be prepended to data + * @iov: User data + * @offset: Posision in iov to start copying from + * @dsz: Total length of user data + * @pktmax: Max packet size that can be used + * @chain: Buffer or chain of buffers to be returned to caller + * Returns message data size or errno: -ENOMEM, -EFAULT + */ +int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, + int offset, int dsz, int pktmax , struct sk_buff **chain) +{ + int mhsz = msg_hdr_sz(mhdr); + int msz = mhsz + dsz; + int pktno = 1; + int pktsz; + int pktrem = pktmax; + int drem = dsz; + struct tipc_msg pkthdr; + struct sk_buff *buf, *prev; + char *pktpos; + int rc; + + msg_set_size(mhdr, msz); + + /* No fragmentation needed? */ + if (likely(msz <= pktmax)) { + buf = tipc_buf_acquire(msz); + *chain = buf; + if (unlikely(!buf)) + return -ENOMEM; + skb_copy_to_linear_data(buf, mhdr, mhsz); + pktpos = buf->data + mhsz; + if (!dsz || !memcpy_fromiovecend(pktpos, iov, offset, dsz)) + return dsz; + rc = -EFAULT; + goto error; + } + + /* Prepare reusable fragment header */ + tipc_msg_init(&pkthdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + INT_H_SIZE, msg_destnode(mhdr)); + msg_set_size(&pkthdr, pktmax); + msg_set_fragm_no(&pkthdr, pktno); + + /* Prepare first fragment */ + *chain = buf = tipc_buf_acquire(pktmax); + if (!buf) + return -ENOMEM; + pktpos = buf->data; + skb_copy_to_linear_data(buf, &pkthdr, INT_H_SIZE); + pktpos += INT_H_SIZE; + pktrem -= INT_H_SIZE; + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, mhdr, mhsz); + pktpos += mhsz; + pktrem -= mhsz; + + do { + if (drem < pktrem) + pktrem = drem; + + if (memcpy_fromiovecend(pktpos, iov, offset, pktrem)) { + rc = -EFAULT; + goto error; + } + drem -= pktrem; + offset += pktrem; + + if (!drem) + break; + + /* Prepare new fragment: */ + if (drem < (pktmax - INT_H_SIZE)) + pktsz = drem + INT_H_SIZE; + else + pktsz = pktmax; + prev = buf; + buf = tipc_buf_acquire(pktsz); + if (!buf) { + rc = -ENOMEM; + goto error; + } + prev->next = buf; + msg_set_type(&pkthdr, FRAGMENT); + msg_set_size(&pkthdr, pktsz); + msg_set_fragm_no(&pkthdr, ++pktno); + skb_copy_to_linear_data(buf, &pkthdr, INT_H_SIZE); + pktpos = buf->data + INT_H_SIZE; + pktrem = pktsz - INT_H_SIZE; + + } while (1); + + msg_set_type(buf_msg(buf), LAST_FRAGMENT); + return dsz; +error: + kfree_skb_list(*chain); + *chain = NULL; + return rc; +} + +/** + * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one + * @bbuf: the existing buffer ("bundle") + * @buf: buffer to be appended + * @mtu: max allowable size for the bundle buffer + * Consumes buffer if successful + * Returns true if bundling could be performed, otherwise false + */ +bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu) +{ + struct tipc_msg *bmsg = buf_msg(bbuf); + struct tipc_msg *msg = buf_msg(buf); + unsigned int bsz = msg_size(bmsg); + unsigned int msz = msg_size(msg); + u32 start = align(bsz); + u32 max = mtu - INT_H_SIZE; + u32 pad = start - bsz; + + if (likely(msg_user(msg) == MSG_FRAGMENTER)) + return false; + if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) + return false; + if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + return false; + if (likely(msg_user(bmsg) != MSG_BUNDLER)) + return false; + if (likely(msg_type(bmsg) != BUNDLE_OPEN)) + return false; + if (unlikely(skb_tailroom(bbuf) < (pad + msz))) + return false; + if (unlikely(max < (start + msz))) + return false; + + skb_put(bbuf, pad + msz); + skb_copy_to_linear_data_offset(bbuf, start, buf->data, msz); + msg_set_size(bmsg, start + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + bbuf->next = buf->next; + kfree_skb(buf); + return true; +} + +/** + * tipc_msg_make_bundle(): Create bundle buf and append message to its tail + * @buf: buffer to be appended and replaced + * @mtu: max allowable size for the bundle buffer, inclusive header + * @dnode: destination node for message. (Not always present in header) + * Replaces buffer if successful + * Returns true if sucess, otherwise false + */ +bool tipc_msg_make_bundle(struct sk_buff **buf, u32 mtu, u32 dnode) +{ + struct sk_buff *bbuf; + struct tipc_msg *bmsg; + struct tipc_msg *msg = buf_msg(*buf); + u32 msz = msg_size(msg); + u32 max = mtu - INT_H_SIZE; + + if (msg_user(msg) == MSG_FRAGMENTER) + return false; + if (msg_user(msg) == CHANGEOVER_PROTOCOL) + return false; + if (msg_user(msg) == BCAST_PROTOCOL) + return false; + if (msz > (max / 2)) + return false; + + bbuf = tipc_buf_acquire(max); + if (!bbuf) + return false; + + skb_trim(bbuf, INT_H_SIZE); + bmsg = buf_msg(bbuf); + tipc_msg_init(bmsg, MSG_BUNDLER, BUNDLE_OPEN, INT_H_SIZE, dnode); + msg_set_seqno(bmsg, msg_seqno(msg)); + msg_set_ack(bmsg, msg_ack(msg)); + msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); + bbuf->next = (*buf)->next; + tipc_msg_bundle(bbuf, *buf, mtu); + *buf = bbuf; + return true; +} + +/** + * tipc_msg_reverse(): swap source and destination addresses and add error code + * @buf: buffer containing message to be reversed + * @dnode: return value: node where to send message after reversal + * @err: error code to be set in message + * Consumes buffer if failure + * Returns true if success, otherwise false + */ +bool tipc_msg_reverse(struct sk_buff *buf, u32 *dnode, int err) +{ + struct tipc_msg *msg = buf_msg(buf); + uint imp = msg_importance(msg); + struct tipc_msg ohdr; + uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); + + if (skb_linearize(buf)) + goto exit; + if (msg_dest_droppable(msg)) + goto exit; + if (msg_errcode(msg)) + goto exit; + + memcpy(&ohdr, msg, msg_hdr_sz(msg)); + imp = min_t(uint, imp + 1, TIPC_CRITICAL_IMPORTANCE); + if (msg_isdata(msg)) + msg_set_importance(msg, imp); + msg_set_errcode(msg, err); + msg_set_origport(msg, msg_destport(&ohdr)); + msg_set_destport(msg, msg_origport(&ohdr)); + msg_set_prevnode(msg, tipc_own_addr); + if (!msg_short(msg)) { + msg_set_orignode(msg, msg_destnode(&ohdr)); + msg_set_destnode(msg, msg_orignode(&ohdr)); + } + msg_set_size(msg, msg_hdr_sz(msg) + rdsz); + skb_trim(buf, msg_size(msg)); + skb_orphan(buf); + *dnode = msg_orignode(&ohdr); + return true; +exit: + kfree_skb(buf); + return false; +} + +/** + * tipc_msg_eval: determine fate of message that found no destination + * @buf: the buffer containing the message. + * @dnode: return value: next-hop node, if message to be forwarded + * @err: error code to use, if message to be rejected + * + * Does not consume buffer + * Returns 0 (TIPC_OK) if message ok and we can try again, -TIPC error + * code if message to be rejected + */ +int tipc_msg_eval(struct sk_buff *buf, u32 *dnode) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 dport; + + if (msg_type(msg) != TIPC_NAMED_MSG) + return -TIPC_ERR_NO_PORT; + if (skb_linearize(buf)) + return -TIPC_ERR_NO_NAME; + if (msg_data_sz(msg) > MAX_FORWARD_SIZE) + return -TIPC_ERR_NO_NAME; + if (msg_reroute_cnt(msg) > 0) + return -TIPC_ERR_NO_NAME; + + *dnode = addr_domain(msg_lookup_scope(msg)); + dport = tipc_nametbl_translate(msg_nametype(msg), + msg_nameinst(msg), + dnode); + if (!dport) + return -TIPC_ERR_NO_NAME; + msg_incr_reroute_cnt(msg); + msg_set_destnode(msg, *dnode); + msg_set_destport(msg, dport); + return TIPC_OK; +} + +/* tipc_msg_reassemble() - clone a buffer chain of fragments and + * reassemble the clones into one message + */ +struct sk_buff *tipc_msg_reassemble(struct sk_buff *chain) +{ + struct sk_buff *buf = chain; + struct sk_buff *frag = buf; + struct sk_buff *head = NULL; + int hdr_sz; + + /* Copy header if single buffer */ + if (!buf->next) { + hdr_sz = skb_headroom(buf) + msg_hdr_sz(buf_msg(buf)); + return __pskb_copy(buf, hdr_sz, GFP_ATOMIC); + } + + /* Clone all fragments and reassemble */ + while (buf) { + frag = skb_clone(buf, GFP_ATOMIC); + if (!frag) + goto error; + frag->next = NULL; + if (tipc_buf_append(&head, &frag)) + break; + if (!head) + goto error; + buf = buf->next; + } + return frag; +error: + pr_warn("Failed do clone local mcast rcv buffer\n"); + kfree_skb(head); + return NULL; +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 503511903d1d..462fa194a6af 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -463,6 +463,11 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) #define FRAGMENT 1 #define LAST_FRAGMENT 2 +/* Bundling protocol message types + */ +#define BUNDLE_OPEN 0 +#define BUNDLE_CLOSED 1 + /* * Link management protocol message types */ @@ -706,12 +711,36 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -u32 tipc_msg_tot_importance(struct tipc_msg *m); +static inline u32 tipc_msg_tot_importance(struct tipc_msg *m) +{ + if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) + return msg_importance(msg_get_wrapped(m)); + return msg_importance(m); +} + +static inline u32 msg_tot_origport(struct tipc_msg *m) +{ + if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) + return msg_origport(msg_get_wrapped(m)); + return msg_origport(m); +} + +bool tipc_msg_reverse(struct sk_buff *buf, u32 *dnode, int err); + +int tipc_msg_eval(struct sk_buff *buf, u32 *dnode); + void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); -int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, - unsigned int len, int max_size, struct sk_buff **buf); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); +bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu); + +bool tipc_msg_make_bundle(struct sk_buff **buf, u32 mtu, u32 dnode); + +int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, + int offset, int dsz, int mtu , struct sk_buff **chain); + +struct sk_buff *tipc_msg_reassemble(struct sk_buff *chain); + #endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 8ce730984aa1..dcc15bcd5692 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -101,24 +101,22 @@ static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) void named_cluster_distribute(struct sk_buff *buf) { - struct sk_buff *buf_copy; - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; + struct sk_buff *obuf; + struct tipc_node *node; + u32 dnode; rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[n_ptr->addr & 1]; - if (l_ptr) { - buf_copy = skb_copy(buf, GFP_ATOMIC); - if (!buf_copy) { - tipc_node_unlock(n_ptr); - break; - } - msg_set_destnode(buf_msg(buf_copy), n_ptr->addr); - __tipc_link_xmit(l_ptr, buf_copy); - } - tipc_node_unlock(n_ptr); + list_for_each_entry_rcu(node, &tipc_node_list, list) { + dnode = node->addr; + if (in_own_node(dnode)) + continue; + if (!tipc_node_active_links(node)) + continue; + obuf = skb_copy(buf, GFP_ATOMIC); + if (!obuf) + break; + msg_set_destnode(buf_msg(obuf), dnode); + tipc_link_xmit(obuf, dnode, dnode); } rcu_read_unlock(); @@ -175,34 +173,44 @@ struct sk_buff *tipc_named_withdraw(struct publication *publ) return buf; } -/* +/** * named_distribute - prepare name info for bulk distribution to another node + * @msg_list: list of messages (buffers) to be returned from this function + * @dnode: node to be updated + * @pls: linked list of publication items to be packed into buffer chain */ -static void named_distribute(struct list_head *message_list, u32 node, - struct publ_list *pls, u32 max_item_buf) +static void named_distribute(struct list_head *msg_list, u32 dnode, + struct publ_list *pls) { struct publication *publ; struct sk_buff *buf = NULL; struct distr_item *item = NULL; - u32 left = 0; - u32 rest = pls->size * ITEM_SIZE; + uint dsz = pls->size * ITEM_SIZE; + uint msg_dsz = (tipc_node_get_mtu(dnode, 0) / ITEM_SIZE) * ITEM_SIZE; + uint rem = dsz; + uint msg_rem = 0; list_for_each_entry(publ, &pls->list, local_list) { + /* Prepare next buffer: */ if (!buf) { - left = (rest <= max_item_buf) ? rest : max_item_buf; - rest -= left; - buf = named_prepare_buf(PUBLICATION, left, node); + msg_rem = min_t(uint, rem, msg_dsz); + rem -= msg_rem; + buf = named_prepare_buf(PUBLICATION, msg_rem, dnode); if (!buf) { pr_warn("Bulk publication failure\n"); return; } item = (struct distr_item *)msg_data(buf_msg(buf)); } + + /* Pack publication into message: */ publ_to_item(item, publ); item++; - left -= ITEM_SIZE; - if (!left) { - list_add_tail((struct list_head *)buf, message_list); + msg_rem -= ITEM_SIZE; + + /* Append full buffer to list: */ + if (!msg_rem) { + list_add_tail((struct list_head *)buf, msg_list); buf = NULL; } } @@ -211,16 +219,20 @@ static void named_distribute(struct list_head *message_list, u32 node, /** * tipc_named_node_up - tell specified node about all publications by this node */ -void tipc_named_node_up(u32 max_item_buf, u32 node) +void tipc_named_node_up(u32 dnode) { - LIST_HEAD(message_list); + LIST_HEAD(msg_list); + struct sk_buff *buf_chain; read_lock_bh(&tipc_nametbl_lock); - named_distribute(&message_list, node, &publ_cluster, max_item_buf); - named_distribute(&message_list, node, &publ_zone, max_item_buf); + named_distribute(&msg_list, dnode, &publ_cluster); + named_distribute(&msg_list, dnode, &publ_zone); read_unlock_bh(&tipc_nametbl_lock); - tipc_link_names_xmit(&message_list, node); + /* Convert circular list to linear list and send: */ + buf_chain = (struct sk_buff *)msg_list.next; + ((struct sk_buff *)msg_list.prev)->next = NULL; + tipc_link_xmit(buf_chain, dnode, dnode); } /** diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index b2eed4ec1526..8afe32b7fc9a 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -70,7 +70,7 @@ struct distr_item { struct sk_buff *tipc_named_publish(struct publication *publ); struct sk_buff *tipc_named_withdraw(struct publication *publ); void named_cluster_distribute(struct sk_buff *buf); -void tipc_named_node_up(u32 max_item_buf, u32 node); +void tipc_named_node_up(u32 dnode); void tipc_named_rcv(struct sk_buff *buf); void tipc_named_reinit(void); diff --git a/net/tipc/net.c b/net/tipc/net.c index f64375e7f99f..7fcc94998fea 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -1,7 +1,7 @@ /* * net/tipc/net.c: TIPC network routing code * - * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -104,67 +104,6 @@ * - A local spin_lock protecting the queue of subscriber events. */ -static void net_route_named_msg(struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - u32 dnode; - u32 dport; - - if (!msg_named(msg)) { - kfree_skb(buf); - return; - } - - dnode = addr_domain(msg_lookup_scope(msg)); - dport = tipc_nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode); - if (dport) { - msg_set_destnode(msg, dnode); - msg_set_destport(msg, dport); - tipc_net_route_msg(buf); - return; - } - tipc_reject_msg(buf, TIPC_ERR_NO_NAME); -} - -void tipc_net_route_msg(struct sk_buff *buf) -{ - struct tipc_msg *msg; - u32 dnode; - - if (!buf) - return; - msg = buf_msg(buf); - - /* Handle message for this node */ - dnode = msg_short(msg) ? tipc_own_addr : msg_destnode(msg); - if (tipc_in_scope(dnode, tipc_own_addr)) { - if (msg_isdata(msg)) { - if (msg_mcast(msg)) - tipc_port_mcast_rcv(buf, NULL); - else if (msg_destport(msg)) - tipc_sk_rcv(buf); - else - net_route_named_msg(buf); - return; - } - switch (msg_user(msg)) { - case NAME_DISTRIBUTOR: - tipc_named_rcv(buf); - break; - case CONN_MANAGER: - tipc_port_proto_rcv(buf); - break; - default: - kfree_skb(buf); - } - return; - } - - /* Handle message for another node */ - skb_trim(buf, msg_size(msg)); - tipc_link_xmit(buf, dnode, msg_link_selector(msg)); -} - int tipc_net_start(u32 addr) { char addr_string[16]; diff --git a/net/tipc/net.h b/net/tipc/net.h index c6c2b46f7c28..59ef3388be2c 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -37,8 +37,6 @@ #ifndef _TIPC_NET_H #define _TIPC_NET_H -void tipc_net_route_msg(struct sk_buff *buf); - int tipc_net_start(u32 addr); void tipc_net_stop(void); diff --git a/net/tipc/node.c b/net/tipc/node.c index 5b44c3041be4..f7069299943f 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012 Ericsson AB + * Copyright (c) 2000-2006, 2012-2014, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * @@ -155,21 +155,25 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) if (!active[0]) { active[0] = active[1] = l_ptr; node_established_contact(n_ptr); - return; + goto exit; } if (l_ptr->priority < active[0]->priority) { pr_info("New link <%s> becomes standby\n", l_ptr->name); - return; + goto exit; } tipc_link_dup_queue_xmit(active[0], l_ptr); if (l_ptr->priority == active[0]->priority) { active[0] = l_ptr; - return; + goto exit; } pr_info("Old link <%s> becomes standby\n", active[0]->name); if (active[1] != active[0]) pr_info("Old link <%s> becomes standby\n", active[1]->name); active[0] = active[1] = l_ptr; +exit: + /* Leave room for changeover header when returning 'mtu' to users: */ + n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; } /** @@ -229,6 +233,19 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) tipc_link_failover_send_queue(l_ptr); else node_lost_contact(n_ptr); + + /* Leave room for changeover header when returning 'mtu' to users: */ + if (active[0]) { + n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + return; + } + + /* Loopback link went down? No fragmentation needed from now on. */ + if (n_ptr->addr == tipc_own_addr) { + n_ptr->act_mtus[0] = MAX_MSG_SIZE; + n_ptr->act_mtus[1] = MAX_MSG_SIZE; + } } int tipc_node_active_links(struct tipc_node *n_ptr) @@ -457,8 +474,6 @@ int tipc_node_get_linkname(u32 bearer_id, u32 addr, char *linkname, size_t len) void tipc_node_unlock(struct tipc_node *node) { LIST_HEAD(nsub_list); - struct tipc_link *link; - int pkt_sz = 0; u32 addr = 0; if (likely(!node->action_flags)) { @@ -471,18 +486,13 @@ void tipc_node_unlock(struct tipc_node *node) node->action_flags &= ~TIPC_NOTIFY_NODE_DOWN; } if (node->action_flags & TIPC_NOTIFY_NODE_UP) { - link = node->active_links[0]; node->action_flags &= ~TIPC_NOTIFY_NODE_UP; - if (link) { - pkt_sz = ((link->max_pkt - INT_H_SIZE) / ITEM_SIZE) * - ITEM_SIZE; - addr = node->addr; - } + addr = node->addr; } spin_unlock_bh(&node->lock); if (!list_empty(&nsub_list)) tipc_nodesub_notify(&nsub_list); - if (pkt_sz) - tipc_named_node_up(pkt_sz, addr); + if (addr) + tipc_named_node_up(addr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index 9087063793f2..b61716a8218e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -41,6 +41,7 @@ #include "addr.h" #include "net.h" #include "bearer.h" +#include "msg.h" /* * Out-of-range value for node signature @@ -105,6 +106,7 @@ struct tipc_node { spinlock_t lock; struct hlist_node hash; struct tipc_link *active_links[2]; + u32 act_mtus[2]; struct tipc_link *links[MAX_BEARERS]; unsigned int action_flags; struct tipc_node_bclink bclink; @@ -143,4 +145,19 @@ static inline bool tipc_node_blocked(struct tipc_node *node) TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); } +static inline uint tipc_node_get_mtu(u32 addr, u32 selector) +{ + struct tipc_node *node; + u32 mtu; + + node = tipc_node_find(addr); + + if (likely(node)) + mtu = node->act_mtus[selector & 1]; + else + mtu = MAX_MSG_SIZE; + + return mtu; +} + #endif diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c index 7c59ab1d6ecb..2d13eea8574a 100644 --- a/net/tipc/node_subscr.c +++ b/net/tipc/node_subscr.c @@ -84,11 +84,13 @@ void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub) void tipc_nodesub_notify(struct list_head *nsub_list) { struct tipc_node_subscr *ns, *safe; + net_ev_handler handle_node_down; list_for_each_entry_safe(ns, safe, nsub_list, nodesub_list) { - if (ns->handle_node_down) { - ns->handle_node_down(ns->usr_handle); + handle_node_down = ns->handle_node_down; + if (handle_node_down) { ns->handle_node_down = NULL; + handle_node_down(ns->usr_handle); } } } diff --git a/net/tipc/port.c b/net/tipc/port.c index 5fd7acce01ea..7e096a5e7701 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -42,8 +42,6 @@ /* Connection management: */ #define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ -#define CONFIRMED 0 -#define PROBING 1 #define MAX_REJECT_SIZE 1024 @@ -76,124 +74,6 @@ int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) (!peernode && (orignode == tipc_own_addr)); } -/** - * tipc_port_mcast_xmit - send a multicast message to local and remote - * destinations - */ -int tipc_port_mcast_xmit(struct tipc_port *oport, - struct tipc_name_seq const *seq, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *hdr; - struct sk_buff *buf; - struct sk_buff *ibuf = NULL; - struct tipc_port_list dports = {0, NULL, }; - int ext_targets; - int res; - - /* Create multicast message */ - hdr = &oport->phdr; - msg_set_type(hdr, TIPC_MCAST_MSG); - msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); - msg_set_destport(hdr, 0); - msg_set_destnode(hdr, 0); - msg_set_nametype(hdr, seq->type); - msg_set_namelower(hdr, seq->lower); - msg_set_nameupper(hdr, seq->upper); - msg_set_hdr_sz(hdr, MCAST_H_SIZE); - res = tipc_msg_build(hdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (unlikely(!buf)) - return res; - - /* Figure out where to send multicast message */ - ext_targets = tipc_nametbl_mc_translate(seq->type, seq->lower, seq->upper, - TIPC_NODE_SCOPE, &dports); - - /* Send message to destinations (duplicate it only if necessary) */ - if (ext_targets) { - if (dports.count != 0) { - ibuf = skb_copy(buf, GFP_ATOMIC); - if (ibuf == NULL) { - tipc_port_list_free(&dports); - kfree_skb(buf); - return -ENOMEM; - } - } - res = tipc_bclink_xmit(buf); - if ((res < 0) && (dports.count != 0)) - kfree_skb(ibuf); - } else { - ibuf = buf; - } - - if (res >= 0) { - if (ibuf) - tipc_port_mcast_rcv(ibuf, &dports); - } else { - tipc_port_list_free(&dports); - } - return res; -} - -/** - * tipc_port_mcast_rcv - deliver multicast message to all destination ports - * - * If there is no port list, perform a lookup to create one - */ -void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list *dp) -{ - struct tipc_msg *msg; - struct tipc_port_list dports = {0, NULL, }; - struct tipc_port_list *item = dp; - int cnt = 0; - - msg = buf_msg(buf); - - /* Create destination port list, if one wasn't supplied */ - if (dp == NULL) { - tipc_nametbl_mc_translate(msg_nametype(msg), - msg_namelower(msg), - msg_nameupper(msg), - TIPC_CLUSTER_SCOPE, - &dports); - item = dp = &dports; - } - - /* Deliver a copy of message to each destination port */ - if (dp->count != 0) { - msg_set_destnode(msg, tipc_own_addr); - if (dp->count == 1) { - msg_set_destport(msg, dp->ports[0]); - tipc_sk_rcv(buf); - tipc_port_list_free(dp); - return; - } - for (; cnt < dp->count; cnt++) { - int index = cnt % PLSIZE; - struct sk_buff *b = skb_clone(buf, GFP_ATOMIC); - - if (b == NULL) { - pr_warn("Unable to deliver multicast message(s)\n"); - goto exit; - } - if ((index == 0) && (cnt != 0)) - item = item->next; - msg_set_destport(buf_msg(b), item->ports[index]); - tipc_sk_rcv(b); - } - } -exit: - kfree_skb(buf); - tipc_port_list_free(dp); -} - - -void tipc_port_wakeup(struct tipc_port *port) -{ - tipc_sock_wakeup(tipc_port_to_sock(port)); -} - /* tipc_port_init - intiate TIPC port and lock it * * Returns obtained reference if initialization is successful, zero otherwise @@ -235,6 +115,8 @@ u32 tipc_port_init(struct tipc_port *p_ptr, void tipc_port_destroy(struct tipc_port *p_ptr) { struct sk_buff *buf = NULL; + struct tipc_msg *msg = NULL; + u32 peer; tipc_withdraw(p_ptr, 0, NULL); @@ -246,14 +128,15 @@ void tipc_port_destroy(struct tipc_port *p_ptr) if (p_ptr->connected) { buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); tipc_nodesub_unsubscribe(&p_ptr->subscription); + msg = buf_msg(buf); + peer = msg_destnode(msg); + tipc_link_xmit(buf, peer, msg_link_selector(msg)); } - spin_lock_bh(&tipc_port_list_lock); list_del(&p_ptr->port_list); list_del(&p_ptr->wait_list); spin_unlock_bh(&tipc_port_list_lock); k_term_timer(&p_ptr->timer); - tipc_net_route_msg(buf); } /* @@ -275,100 +158,16 @@ static struct sk_buff *port_build_proto_msg(struct tipc_port *p_ptr, msg_set_destport(msg, tipc_port_peerport(p_ptr)); msg_set_origport(msg, p_ptr->ref); msg_set_msgcnt(msg, ack); + buf->next = NULL; } return buf; } -int tipc_reject_msg(struct sk_buff *buf, u32 err) -{ - struct tipc_msg *msg = buf_msg(buf); - struct sk_buff *rbuf; - struct tipc_msg *rmsg; - int hdr_sz; - u32 imp; - u32 data_sz = msg_data_sz(msg); - u32 src_node; - u32 rmsg_sz; - - /* discard rejected message if it shouldn't be returned to sender */ - if (WARN(!msg_isdata(msg), - "attempt to reject message with user=%u", msg_user(msg))) { - dump_stack(); - goto exit; - } - if (msg_errcode(msg) || msg_dest_droppable(msg)) - goto exit; - - /* - * construct returned message by copying rejected message header and - * data (or subset), then updating header fields that need adjusting - */ - hdr_sz = msg_hdr_sz(msg); - rmsg_sz = hdr_sz + min_t(u32, data_sz, MAX_REJECT_SIZE); - - rbuf = tipc_buf_acquire(rmsg_sz); - if (rbuf == NULL) - goto exit; - - rmsg = buf_msg(rbuf); - skb_copy_to_linear_data(rbuf, msg, rmsg_sz); - - if (msg_connected(rmsg)) { - imp = msg_importance(rmsg); - if (imp < TIPC_CRITICAL_IMPORTANCE) - msg_set_importance(rmsg, ++imp); - } - msg_set_non_seq(rmsg, 0); - msg_set_size(rmsg, rmsg_sz); - msg_set_errcode(rmsg, err); - msg_set_prevnode(rmsg, tipc_own_addr); - msg_swap_words(rmsg, 4, 5); - if (!msg_short(rmsg)) - msg_swap_words(rmsg, 6, 7); - - /* send self-abort message when rejecting on a connected port */ - if (msg_connected(msg)) { - struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg)); - - if (p_ptr) { - struct sk_buff *abuf = NULL; - - if (p_ptr->connected) - abuf = port_build_self_abort_msg(p_ptr, err); - tipc_port_unlock(p_ptr); - tipc_net_route_msg(abuf); - } - } - - /* send returned message & dispose of rejected message */ - src_node = msg_prevnode(msg); - if (in_own_node(src_node)) - tipc_sk_rcv(rbuf); - else - tipc_link_xmit(rbuf, src_node, msg_link_selector(rmsg)); -exit: - kfree_skb(buf); - return data_sz; -} - -int tipc_port_iovec_reject(struct tipc_port *p_ptr, struct tipc_msg *hdr, - struct iovec const *msg_sect, unsigned int len, - int err) -{ - struct sk_buff *buf; - int res; - - res = tipc_msg_build(hdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (!buf) - return res; - - return tipc_reject_msg(buf, err); -} - static void port_timeout(unsigned long ref) { struct tipc_port *p_ptr = tipc_port_lock(ref); struct sk_buff *buf = NULL; + struct tipc_msg *msg = NULL; if (!p_ptr) return; @@ -379,15 +178,16 @@ static void port_timeout(unsigned long ref) } /* Last probe answered ? */ - if (p_ptr->probing_state == PROBING) { + if (p_ptr->probing_state == TIPC_CONN_PROBING) { buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT); } else { buf = port_build_proto_msg(p_ptr, CONN_PROBE, 0); - p_ptr->probing_state = PROBING; + p_ptr->probing_state = TIPC_CONN_PROBING; k_start_timer(&p_ptr->timer, p_ptr->probing_interval); } tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); + msg = buf_msg(buf); + tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } @@ -395,12 +195,14 @@ static void port_handle_node_down(unsigned long ref) { struct tipc_port *p_ptr = tipc_port_lock(ref); struct sk_buff *buf = NULL; + struct tipc_msg *msg = NULL; if (!p_ptr) return; buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE); tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); + msg = buf_msg(buf); + tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } @@ -412,6 +214,7 @@ static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 er struct tipc_msg *msg = buf_msg(buf); msg_swap_words(msg, 4, 5); msg_swap_words(msg, 6, 7); + buf->next = NULL; } return buf; } @@ -436,60 +239,11 @@ static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 er if (imp < TIPC_CRITICAL_IMPORTANCE) msg_set_importance(msg, ++imp); msg_set_errcode(msg, err); + buf->next = NULL; } return buf; } -void tipc_port_proto_rcv(struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - struct tipc_port *p_ptr; - struct sk_buff *r_buf = NULL; - u32 destport = msg_destport(msg); - int wakeable; - - /* Validate connection */ - p_ptr = tipc_port_lock(destport); - if (!p_ptr || !p_ptr->connected || !tipc_port_peer_msg(p_ptr, msg)) { - r_buf = tipc_buf_acquire(BASIC_H_SIZE); - if (r_buf) { - msg = buf_msg(r_buf); - tipc_msg_init(msg, TIPC_HIGH_IMPORTANCE, TIPC_CONN_MSG, - BASIC_H_SIZE, msg_orignode(msg)); - msg_set_errcode(msg, TIPC_ERR_NO_PORT); - msg_set_origport(msg, destport); - msg_set_destport(msg, msg_origport(msg)); - } - if (p_ptr) - tipc_port_unlock(p_ptr); - goto exit; - } - - /* Process protocol message sent by peer */ - switch (msg_type(msg)) { - case CONN_ACK: - wakeable = tipc_port_congested(p_ptr) && p_ptr->congested; - p_ptr->acked += msg_msgcnt(msg); - if (!tipc_port_congested(p_ptr)) { - p_ptr->congested = 0; - if (wakeable) - tipc_port_wakeup(p_ptr); - } - break; - case CONN_PROBE: - r_buf = port_build_proto_msg(p_ptr, CONN_PROBE_REPLY, 0); - break; - default: - /* CONN_PROBE_REPLY or unrecognized - no action required */ - break; - } - p_ptr->probing_state = CONFIRMED; - tipc_port_unlock(p_ptr); -exit: - tipc_net_route_msg(r_buf); - kfree_skb(buf); -} - static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) { struct publication *publ; @@ -581,16 +335,19 @@ void tipc_acknowledge(u32 ref, u32 ack) { struct tipc_port *p_ptr; struct sk_buff *buf = NULL; + struct tipc_msg *msg; p_ptr = tipc_port_lock(ref); if (!p_ptr) return; - if (p_ptr->connected) { - p_ptr->conn_unacked -= ack; + if (p_ptr->connected) buf = port_build_proto_msg(p_ptr, CONN_ACK, ack); - } + tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); + if (!buf) + return; + msg = buf_msg(buf); + tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); } int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, @@ -689,7 +446,7 @@ int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, msg_set_hdr_sz(msg, SHORT_H_SIZE); p_ptr->probing_interval = PROBING_INTERVAL; - p_ptr->probing_state = CONFIRMED; + p_ptr->probing_state = TIPC_CONN_OK; p_ptr->connected = 1; k_start_timer(&p_ptr->timer, p_ptr->probing_interval); @@ -698,7 +455,7 @@ int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, (net_ev_handler)port_handle_node_down); res = 0; exit: - p_ptr->max_pkt = tipc_link_get_max_pkt(peer->node, ref); + p_ptr->max_pkt = tipc_node_get_mtu(peer->node, ref); return res; } @@ -741,6 +498,7 @@ int tipc_port_disconnect(u32 ref) */ int tipc_port_shutdown(u32 ref) { + struct tipc_msg *msg; struct tipc_port *p_ptr; struct sk_buff *buf = NULL; @@ -750,149 +508,7 @@ int tipc_port_shutdown(u32 ref) buf = port_build_peer_abort_msg(p_ptr, TIPC_CONN_SHUTDOWN); tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); + msg = buf_msg(buf); + tipc_link_xmit(buf, msg_destnode(msg), msg_link_selector(msg)); return tipc_port_disconnect(ref); } - -/* - * tipc_port_iovec_rcv: Concatenate and deliver sectioned - * message for this node. - */ -static int tipc_port_iovec_rcv(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len) -{ - struct sk_buff *buf; - int res; - - res = tipc_msg_build(&sender->phdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (likely(buf)) - tipc_sk_rcv(buf); - return res; -} - -/** - * tipc_send - send message sections on connection - */ -int tipc_send(struct tipc_port *p_ptr, - struct iovec const *msg_sect, - unsigned int len) -{ - u32 destnode; - int res; - - if (!p_ptr->connected) - return -EINVAL; - - p_ptr->congested = 1; - if (!tipc_port_congested(p_ptr)) { - destnode = tipc_port_peernode(p_ptr); - if (likely(!in_own_node(destnode))) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - destnode); - else - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - - if (likely(res != -ELINKCONG)) { - p_ptr->congested = 0; - if (res > 0) - p_ptr->sent++; - return res; - } - } - if (tipc_port_unreliable(p_ptr)) { - p_ptr->congested = 0; - return len; - } - return -ELINKCONG; -} - -/** - * tipc_send2name - send message sections to port name - */ -int tipc_send2name(struct tipc_port *p_ptr, - struct tipc_name const *name, - unsigned int domain, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *msg; - u32 destnode = domain; - u32 destport; - int res; - - if (p_ptr->connected) - return -EINVAL; - - msg = &p_ptr->phdr; - msg_set_type(msg, TIPC_NAMED_MSG); - msg_set_hdr_sz(msg, NAMED_H_SIZE); - msg_set_nametype(msg, name->type); - msg_set_nameinst(msg, name->instance); - msg_set_lookup_scope(msg, tipc_addr_scope(domain)); - destport = tipc_nametbl_translate(name->type, name->instance, &destnode); - msg_set_destnode(msg, destnode); - msg_set_destport(msg, destport); - - if (likely(destport || destnode)) { - if (likely(in_own_node(destnode))) - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - else if (tipc_own_addr) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - destnode); - else - res = tipc_port_iovec_reject(p_ptr, msg, msg_sect, - len, TIPC_ERR_NO_NODE); - if (likely(res != -ELINKCONG)) { - if (res > 0) - p_ptr->sent++; - return res; - } - if (tipc_port_unreliable(p_ptr)) - return len; - - return -ELINKCONG; - } - return tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, - TIPC_ERR_NO_NAME); -} - -/** - * tipc_send2port - send message sections to port identity - */ -int tipc_send2port(struct tipc_port *p_ptr, - struct tipc_portid const *dest, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *msg; - int res; - - if (p_ptr->connected) - return -EINVAL; - - msg = &p_ptr->phdr; - msg_set_type(msg, TIPC_DIRECT_MSG); - msg_set_lookup_scope(msg, 0); - msg_set_destnode(msg, dest->node); - msg_set_destport(msg, dest->ref); - msg_set_hdr_sz(msg, BASIC_H_SIZE); - - if (in_own_node(dest->node)) - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - else if (tipc_own_addr) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - dest->node); - else - res = tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, - TIPC_ERR_NO_NODE); - if (likely(res != -ELINKCONG)) { - if (res > 0) - p_ptr->sent++; - return res; - } - if (tipc_port_unreliable(p_ptr)) - return len; - - return -ELINKCONG; -} diff --git a/net/tipc/port.h b/net/tipc/port.h index cf4ca5b1d9a4..3f93454592b6 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -53,17 +53,13 @@ * @connected: non-zero if port is currently connected to a peer port * @conn_type: TIPC type used when connection was established * @conn_instance: TIPC instance used when connection was established - * @conn_unacked: number of unacknowledged messages received from peer port * @published: non-zero if port has one or more associated names - * @congested: non-zero if cannot send because of link or port congestion * @max_pkt: maximum packet size "hint" used when building messages sent by port * @ref: unique reference to port in TIPC object registry * @phdr: preformatted message header used when sending messages * @port_list: adjacent ports in TIPC's global list of ports * @wait_list: adjacent ports in list of ports waiting on link congestion * @waiting_pkts: - * @sent: # of non-empty messages sent by port - * @acked: # of non-empty message acknowledgements from connected port's peer * @publications: list of publications for port * @pub_count: total # of publications port has made during its lifetime * @probing_state: @@ -76,17 +72,13 @@ struct tipc_port { int connected; u32 conn_type; u32 conn_instance; - u32 conn_unacked; int published; - u32 congested; u32 max_pkt; u32 ref; struct tipc_msg phdr; struct list_head port_list; struct list_head wait_list; u32 waiting_pkts; - u32 sent; - u32 acked; struct list_head publications; u32 pub_count; u32 probing_state; @@ -104,8 +96,6 @@ struct tipc_port_list; u32 tipc_port_init(struct tipc_port *p_ptr, const unsigned int importance); -int tipc_reject_msg(struct sk_buff *buf, u32 err); - void tipc_acknowledge(u32 port_ref, u32 ack); void tipc_port_destroy(struct tipc_port *p_ptr); @@ -122,8 +112,6 @@ int tipc_port_disconnect(u32 portref); int tipc_port_shutdown(u32 ref); -void tipc_port_wakeup(struct tipc_port *port); - /* * The following routines require that the port be locked on entry */ @@ -132,39 +120,7 @@ int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, struct tipc_portid const *peer); int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); -/* - * TIPC messaging routines - */ - -int tipc_send(struct tipc_port *port, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_send2name(struct tipc_port *port, - struct tipc_name const *name, - u32 domain, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_send2port(struct tipc_port *port, - struct tipc_portid const *dest, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_port_mcast_xmit(struct tipc_port *port, - struct tipc_name_seq const *seq, - struct iovec const *msg, - unsigned int len); - -int tipc_port_iovec_reject(struct tipc_port *p_ptr, - struct tipc_msg *hdr, - struct iovec const *msg_sect, - unsigned int len, - int err); - struct sk_buff *tipc_port_get_ports(void); -void tipc_port_proto_rcv(struct sk_buff *buf); -void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list *dp); void tipc_port_reinit(void); /** @@ -185,12 +141,6 @@ static inline void tipc_port_unlock(struct tipc_port *p_ptr) spin_unlock_bh(p_ptr->lock); } -static inline int tipc_port_congested(struct tipc_port *p_ptr) -{ - return ((p_ptr->sent - p_ptr->acked) >= TIPC_FLOWCTRL_WIN); -} - - static inline u32 tipc_port_peernode(struct tipc_port *p_ptr) { return msg_destnode(&p_ptr->phdr); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ef0475568f9e..8477d08a6aa0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -36,20 +36,24 @@ #include "core.h" #include "port.h" +#include "name_table.h" #include "node.h" - +#include "link.h" #include <linux/export.h> +#include "link.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define TIPC_FWD_MSG 1 static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); +static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -123,9 +127,12 @@ static void advance_rx_queue(struct sock *sk) static void reject_rx_queue(struct sock *sk) { struct sk_buff *buf; + u32 dnode; - while ((buf = __skb_dequeue(&sk->sk_receive_queue))) - tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + while ((buf = __skb_dequeue(&sk->sk_receive_queue))) { + if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) + tipc_link_xmit(buf, dnode, 0); + } } /** @@ -201,6 +208,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; + tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); tipc_port_unlock(port); @@ -303,6 +311,7 @@ static int tipc_release(struct socket *sock) struct tipc_sock *tsk; struct tipc_port *port; struct sk_buff *buf; + u32 dnode; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -331,7 +340,8 @@ static int tipc_release(struct socket *sock) sock->state = SS_DISCONNECTING; tipc_port_disconnect(port->ref); } - tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) + tipc_link_xmit(buf, dnode, 0); } } @@ -504,12 +514,12 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch ((int)sock->state) { case SS_UNCONNECTED: - if (!tsk->port.congested) + if (!tsk->link_cong) mask |= POLLOUT; break; case SS_READY: case SS_CONNECTED: - if (!tsk->port.congested) + if (!tsk->link_cong && !tipc_sk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case SS_CONNECTING: @@ -526,6 +536,136 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, } /** + * tipc_sendmcast - send multicast message + * @sock: socket structure + * @seq: destination address + * @iov: message data to send + * @dsz: total length of message data + * @timeo: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, + struct iovec *iov, size_t dsz, long timeo) +{ + struct sock *sk = sock->sk; + struct tipc_msg *mhdr = &tipc_sk(sk)->port.phdr; + struct sk_buff *buf; + uint mtu; + int rc; + + msg_set_type(mhdr, TIPC_MCAST_MSG); + msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); + msg_set_destport(mhdr, 0); + msg_set_destnode(mhdr, 0); + msg_set_nametype(mhdr, seq->type); + msg_set_namelower(mhdr, seq->lower); + msg_set_nameupper(mhdr, seq->upper); + msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + +new_mtu: + mtu = tipc_bclink_get_mtu(); + rc = tipc_msg_build(mhdr, iov, 0, dsz, mtu, &buf); + if (unlikely(rc < 0)) + return rc; + + do { + rc = tipc_bclink_xmit(buf); + if (likely(rc >= 0)) { + rc = dsz; + break; + } + if (rc == -EMSGSIZE) + goto new_mtu; + if (rc != -ELINKCONG) + break; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); + return rc; +} + +/* tipc_sk_mcast_rcv - Deliver multicast message to all destination sockets + */ +void tipc_sk_mcast_rcv(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_port_list dports = {0, NULL, }; + struct tipc_port_list *item; + struct sk_buff *b; + uint i, last, dst = 0; + u32 scope = TIPC_CLUSTER_SCOPE; + + if (in_own_node(msg_orignode(msg))) + scope = TIPC_NODE_SCOPE; + + /* Create destination port list: */ + tipc_nametbl_mc_translate(msg_nametype(msg), + msg_namelower(msg), + msg_nameupper(msg), + scope, + &dports); + last = dports.count; + if (!last) { + kfree_skb(buf); + return; + } + + for (item = &dports; item; item = item->next) { + for (i = 0; i < PLSIZE && ++dst <= last; i++) { + b = (dst != last) ? skb_clone(buf, GFP_ATOMIC) : buf; + if (!b) { + pr_warn("Failed do clone mcast rcv buffer\n"); + continue; + } + msg_set_destport(msg, item->ports[i]); + tipc_sk_rcv(b); + } + } + tipc_port_list_free(&dports); +} + +/** + * tipc_sk_proto_rcv - receive a connection mng protocol message + * @tsk: receiving socket + * @dnode: node to send response message to, if any + * @buf: buffer containing protocol message + * Returns 0 (TIPC_OK) if message was consumed, 1 (TIPC_FWD_MSG) if + * (CONN_PROBE_REPLY) message should be forwarded. + */ +static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, + struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_port *port = &tsk->port; + int conn_cong; + + /* Ignore if connection cannot be validated: */ + if (!port->connected || !tipc_port_peer_msg(port, msg)) + goto exit; + + port->probing_state = TIPC_CONN_OK; + + if (msg_type(msg) == CONN_ACK) { + conn_cong = tipc_sk_conn_cong(tsk); + tsk->sent_unacked -= msg_msgcnt(msg); + if (conn_cong) + tipc_sock_wakeup(tsk); + } else if (msg_type(msg) == CONN_PROBE) { + if (!tipc_msg_reverse(buf, dnode, TIPC_OK)) + return TIPC_OK; + msg_set_type(msg, CONN_PROBE_REPLY); + return TIPC_FWD_MSG; + } + /* Do nothing if msg_type() == CONN_PROBE_REPLY */ +exit: + kfree_skb(buf); + return TIPC_OK; +} + +/** * dest_name_check - verify user is permitted to send to specified port name * @dest: destination address * @m: descriptor for message to be sent @@ -539,6 +679,8 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) { struct tipc_cfg_msg_hdr hdr; + if (unlikely(dest->addrtype == TIPC_ADDR_ID)) + return 0; if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES)) return 0; if (likely(dest->addr.name.name.type == TIPC_TOP_SRV)) @@ -575,19 +717,18 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) return sock_intr_errno(*timeo_p); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - done = sk_wait_event(sk, timeo_p, !tsk->port.congested); + done = sk_wait_event(sk, timeo_p, !tsk->link_cong); finish_wait(sk_sleep(sk), &wait); } while (!done); return 0; } - /** * tipc_sendmsg - send message in connectionless manner * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send - * @total_len: length of message + * @dsz: amount of user data to be sent * * Message must have an destination specified explicitly. * Used for SOCK_RDM and SOCK_DGRAM messages, @@ -597,100 +738,123 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) * Returns the number of bytes sent on success, or errno otherwise */ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) + struct msghdr *m, size_t dsz) { + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_port *port = &tsk->port; - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - int needs_conn; + struct tipc_msg *mhdr = &port->phdr; + struct iovec *iov = m->msg_iov; + u32 dnode, dport; + struct sk_buff *buf; + struct tipc_name_seq *seq = &dest->addr.nameseq; + u32 mtu; long timeo; - int res = -EINVAL; + int rc = -EINVAL; if (unlikely(!dest)) return -EDESTADDRREQ; + if (unlikely((m->msg_namelen < sizeof(*dest)) || (dest->family != AF_TIPC))) return -EINVAL; - if (total_len > TIPC_MAX_USER_MSG_SIZE) + + if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; if (iocb) lock_sock(sk); - needs_conn = (sock->state != SS_READY); - if (unlikely(needs_conn)) { + if (unlikely(sock->state != SS_READY)) { if (sock->state == SS_LISTENING) { - res = -EPIPE; + rc = -EPIPE; goto exit; } if (sock->state != SS_UNCONNECTED) { - res = -EISCONN; + rc = -EISCONN; goto exit; } if (tsk->port.published) { - res = -EOPNOTSUPP; + rc = -EOPNOTSUPP; goto exit; } if (dest->addrtype == TIPC_ADDR_NAME) { tsk->port.conn_type = dest->addr.name.name.type; tsk->port.conn_instance = dest->addr.name.name.instance; } - - /* Abort any pending connection attempts (very unlikely) */ - reject_rx_queue(sk); } + rc = dest_name_check(dest, m); + if (rc) + goto exit; timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - do { - if (dest->addrtype == TIPC_ADDR_NAME) { - res = dest_name_check(dest, m); - if (res) - break; - res = tipc_send2name(port, - &dest->addr.name.name, - dest->addr.name.domain, - m->msg_iov, - total_len); - } else if (dest->addrtype == TIPC_ADDR_ID) { - res = tipc_send2port(port, - &dest->addr.id, - m->msg_iov, - total_len); - } else if (dest->addrtype == TIPC_ADDR_MCAST) { - if (needs_conn) { - res = -EOPNOTSUPP; - break; - } - res = dest_name_check(dest, m); - if (res) - break; - res = tipc_port_mcast_xmit(port, - &dest->addr.nameseq, - m->msg_iov, - total_len); + + if (dest->addrtype == TIPC_ADDR_MCAST) { + rc = tipc_sendmcast(sock, seq, iov, dsz, timeo); + goto exit; + } else if (dest->addrtype == TIPC_ADDR_NAME) { + u32 type = dest->addr.name.name.type; + u32 inst = dest->addr.name.name.instance; + u32 domain = dest->addr.name.domain; + + dnode = domain; + msg_set_type(mhdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(mhdr, NAMED_H_SIZE); + msg_set_nametype(mhdr, type); + msg_set_nameinst(mhdr, inst); + msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); + dport = tipc_nametbl_translate(type, inst, &dnode); + msg_set_destnode(mhdr, dnode); + msg_set_destport(mhdr, dport); + if (unlikely(!dport && !dnode)) { + rc = -EHOSTUNREACH; + goto exit; } - if (likely(res != -ELINKCONG)) { - if (needs_conn && (res >= 0)) + } else if (dest->addrtype == TIPC_ADDR_ID) { + dnode = dest->addr.id.node; + msg_set_type(mhdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(mhdr, 0); + msg_set_destnode(mhdr, dnode); + msg_set_destport(mhdr, dest->addr.id.ref); + msg_set_hdr_sz(mhdr, BASIC_H_SIZE); + } + +new_mtu: + mtu = tipc_node_get_mtu(dnode, tsk->port.ref); + rc = tipc_msg_build(mhdr, iov, 0, dsz, mtu, &buf); + if (rc < 0) + goto exit; + + do { + rc = tipc_link_xmit(buf, dnode, tsk->port.ref); + if (likely(rc >= 0)) { + if (sock->state != SS_READY) sock->state = SS_CONNECTING; + rc = dsz; break; } - res = tipc_wait_for_sndmsg(sock, &timeo); - if (res) + if (rc == -EMSGSIZE) + goto new_mtu; + + if (rc != -ELINKCONG) break; - } while (1); + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); exit: if (iocb) release_sock(sk); - return res; + + return rc; } static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; DEFINE_WAIT(wait); int done; @@ -709,37 +873,49 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); done = sk_wait_event(sk, timeo_p, - (!port->congested || !port->connected)); + (!tsk->link_cong && + !tipc_sk_conn_cong(tsk)) || + !tsk->port.connected); finish_wait(sk_sleep(sk), &wait); } while (!done); return 0; } /** - * tipc_send_packet - send a connection-oriented message - * @iocb: if NULL, indicates that socket lock is already held + * tipc_send_stream - send stream-oriented data + * @iocb: (unused) * @sock: socket structure - * @m: message to send - * @total_len: length of message + * @m: data to send + * @dsz: total length of data to be transmitted * - * Used for SOCK_SEQPACKET messages and SOCK_STREAM data. + * Used for SOCK_STREAM data. * - * Returns the number of bytes sent on success, or errno otherwise + * Returns the number of bytes sent on success (or partial success), + * or errno if no data sent */ -static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_port *port = &tsk->port; + struct tipc_msg *mhdr = &port->phdr; + struct sk_buff *buf; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - int res = -EINVAL; + u32 ref = port->ref; + int rc = -EINVAL; long timeo; + u32 dnode; + uint mtu, send, sent = 0; /* Handle implied connection establishment */ - if (unlikely(dest)) - return tipc_sendmsg(iocb, sock, m, total_len); - - if (total_len > TIPC_MAX_USER_MSG_SIZE) + if (unlikely(dest)) { + rc = tipc_sendmsg(iocb, sock, m, dsz); + if (dsz && (dsz == rc)) + tsk->sent_unacked = 1; + return rc; + } + if (dsz > (uint)INT_MAX) return -EMSGSIZE; if (iocb) @@ -747,123 +923,66 @@ static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, if (unlikely(sock->state != SS_CONNECTED)) { if (sock->state == SS_DISCONNECTING) - res = -EPIPE; + rc = -EPIPE; else - res = -ENOTCONN; + rc = -ENOTCONN; goto exit; } timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + dnode = tipc_port_peernode(port); + +next: + mtu = port->max_pkt; + send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); + rc = tipc_msg_build(mhdr, m->msg_iov, sent, send, mtu, &buf); + if (unlikely(rc < 0)) + goto exit; do { - res = tipc_send(&tsk->port, m->msg_iov, total_len); - if (likely(res != -ELINKCONG)) - break; - res = tipc_wait_for_sndpkt(sock, &timeo); - if (res) - break; - } while (1); + if (likely(!tipc_sk_conn_cong(tsk))) { + rc = tipc_link_xmit(buf, dnode, ref); + if (likely(!rc)) { + tsk->sent_unacked++; + sent += send; + if (sent == dsz) + break; + goto next; + } + if (rc == -EMSGSIZE) { + port->max_pkt = tipc_node_get_mtu(dnode, ref); + goto next; + } + if (rc != -ELINKCONG) + break; + } + rc = tipc_wait_for_sndpkt(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); exit: if (iocb) release_sock(sk); - return res; + return sent ? sent : rc; } /** - * tipc_send_stream - send stream-oriented data - * @iocb: (unused) + * tipc_send_packet - send a connection-oriented message + * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure - * @m: data to send - * @total_len: total length of data to be sent + * @m: message to send + * @dsz: length of data to be transmitted * - * Used for SOCK_STREAM data. + * Used for SOCK_SEQPACKET messages. * - * Returns the number of bytes sent on success (or partial success), - * or errno if no data sent + * Returns the number of bytes sent on success, or errno otherwise */ -static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t dsz) { - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct msghdr my_msg; - struct iovec my_iov; - struct iovec *curr_iov; - int curr_iovlen; - char __user *curr_start; - u32 hdr_size; - int curr_left; - int bytes_to_send; - int bytes_sent; - int res; - - lock_sock(sk); - - /* Handle special cases where there is no connection */ - if (unlikely(sock->state != SS_CONNECTED)) { - if (sock->state == SS_UNCONNECTED) - res = tipc_send_packet(NULL, sock, m, total_len); - else - res = sock->state == SS_DISCONNECTING ? -EPIPE : -ENOTCONN; - goto exit; - } - - if (unlikely(m->msg_name)) { - res = -EISCONN; - goto exit; - } - - if (total_len > (unsigned int)INT_MAX) { - res = -EMSGSIZE; - goto exit; - } - - /* - * Send each iovec entry using one or more messages - * - * Note: This algorithm is good for the most likely case - * (i.e. one large iovec entry), but could be improved to pass sets - * of small iovec entries into send_packet(). - */ - curr_iov = m->msg_iov; - curr_iovlen = m->msg_iovlen; - my_msg.msg_iov = &my_iov; - my_msg.msg_iovlen = 1; - my_msg.msg_flags = m->msg_flags; - my_msg.msg_name = NULL; - bytes_sent = 0; - - hdr_size = msg_hdr_sz(&tsk->port.phdr); - - while (curr_iovlen--) { - curr_start = curr_iov->iov_base; - curr_left = curr_iov->iov_len; - - while (curr_left) { - bytes_to_send = tsk->port.max_pkt - hdr_size; - if (bytes_to_send > TIPC_MAX_USER_MSG_SIZE) - bytes_to_send = TIPC_MAX_USER_MSG_SIZE; - if (curr_left < bytes_to_send) - bytes_to_send = curr_left; - my_iov.iov_base = curr_start; - my_iov.iov_len = bytes_to_send; - res = tipc_send_packet(NULL, sock, &my_msg, - bytes_to_send); - if (res < 0) { - if (bytes_sent) - res = bytes_sent; - goto exit; - } - curr_left -= bytes_to_send; - curr_start += bytes_to_send; - bytes_sent += bytes_to_send; - } + if (dsz > TIPC_MAX_USER_MSG_SIZE) + return -EMSGSIZE; - curr_iov++; - } - res = bytes_sent; -exit: - release_sock(sk); - return res; + return tipc_send_stream(iocb, sock, m, dsz); } /** @@ -1104,8 +1223,10 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { if ((sock->state != SS_READY) && - (++port->conn_unacked >= TIPC_CONNACK_INTV)) - tipc_acknowledge(port->ref, port->conn_unacked); + (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { + tipc_acknowledge(port->ref, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + } advance_rx_queue(sk); } exit: @@ -1213,8 +1334,10 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++port->conn_unacked >= TIPC_CONNACK_INTV)) - tipc_acknowledge(port->ref, port->conn_unacked); + if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { + tipc_acknowledge(port->ref, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + } advance_rx_queue(sk); } @@ -1269,17 +1392,16 @@ static void tipc_data_ready(struct sock *sk) * @tsk: TIPC socket * @msg: message * - * Returns TIPC error status code and socket error status code - * once it encounters some errors + * Returns 0 (TIPC_OK) if everyting ok, -TIPC_ERR_NO_PORT otherwise */ -static u32 filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) +static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) { struct sock *sk = &tsk->sk; struct tipc_port *port = &tsk->port; struct socket *sock = sk->sk_socket; struct tipc_msg *msg = buf_msg(*buf); - u32 retval = TIPC_ERR_NO_PORT; + int retval = -TIPC_ERR_NO_PORT; int res; if (msg_mcast(msg)) @@ -1382,32 +1504,37 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) * * Called with socket lock already taken; port lock may also be taken. * - * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + * Returns 0 (TIPC_OK) if message was consumed, -TIPC error code if message + * to be rejected, 1 (TIPC_FWD_MSG) if (CONN_MANAGER) message to be forwarded */ -static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) +static int filter_rcv(struct sock *sk, struct sk_buff *buf) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *msg = buf_msg(buf); unsigned int limit = rcvbuf_limit(sk, buf); - u32 res = TIPC_OK; + u32 onode; + int rc = TIPC_OK; + + if (unlikely(msg_user(msg) == CONN_MANAGER)) + return tipc_sk_proto_rcv(tsk, &onode, buf); /* Reject message if it is wrong sort of message for socket */ if (msg_type(msg) > TIPC_DIRECT_MSG) - return TIPC_ERR_NO_PORT; + return -TIPC_ERR_NO_PORT; if (sock->state == SS_READY) { if (msg_connected(msg)) - return TIPC_ERR_NO_PORT; + return -TIPC_ERR_NO_PORT; } else { - res = filter_connect(tsk, &buf); - if (res != TIPC_OK || buf == NULL) - return res; + rc = filter_connect(tsk, &buf); + if (rc != TIPC_OK || buf == NULL) + return rc; } /* Reject message if there isn't room to queue it */ if (sk_rmem_alloc_get(sk) + buf->truesize >= limit) - return TIPC_ERR_OVERLOAD; + return -TIPC_ERR_OVERLOAD; /* Enqueue message */ TIPC_SKB_CB(buf)->handle = NULL; @@ -1429,16 +1556,23 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) */ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *buf) { - u32 res; + int rc; + u32 onode; struct tipc_sock *tsk = tipc_sk(sk); uint truesize = buf->truesize; - res = filter_rcv(sk, buf); - if (unlikely(res)) - tipc_reject_msg(buf, res); + rc = filter_rcv(sk, buf); - if (atomic_read(&tsk->dupl_rcvcnt) < TIPC_CONN_OVERLOAD_LIMIT) - atomic_add(truesize, &tsk->dupl_rcvcnt); + if (likely(!rc)) { + if (atomic_read(&tsk->dupl_rcvcnt) < TIPC_CONN_OVERLOAD_LIMIT) + atomic_add(truesize, &tsk->dupl_rcvcnt); + return 0; + } + + if ((rc < 0) && !tipc_msg_reverse(buf, &onode, -rc)) + return 0; + + tipc_link_xmit(buf, onode, 0); return 0; } @@ -1455,19 +1589,14 @@ int tipc_sk_rcv(struct sk_buff *buf) struct tipc_port *port; struct sock *sk; u32 dport = msg_destport(buf_msg(buf)); - int err = TIPC_OK; + int rc = TIPC_OK; uint limit; + u32 dnode; - /* Forward unresolved named message */ - if (unlikely(!dport)) { - tipc_net_route_msg(buf); - return 0; - } - - /* Validate destination */ + /* Validate destination and message */ port = tipc_port_lock(dport); if (unlikely(!port)) { - err = TIPC_ERR_NO_PORT; + rc = tipc_msg_eval(buf, &dnode); goto exit; } @@ -1478,23 +1607,25 @@ int tipc_sk_rcv(struct sk_buff *buf) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - err = filter_rcv(sk, buf); + rc = filter_rcv(sk, buf); } else { if (sk->sk_backlog.len == 0) atomic_set(&tsk->dupl_rcvcnt, 0); limit = rcvbuf_limit(sk, buf) + atomic_read(&tsk->dupl_rcvcnt); if (sk_add_backlog(sk, buf, limit)) - err = TIPC_ERR_OVERLOAD; + rc = -TIPC_ERR_OVERLOAD; } - bh_unlock_sock(sk); tipc_port_unlock(port); - if (likely(!err)) + if (likely(!rc)) return 0; exit: - tipc_reject_msg(buf, err); - return -EHOSTUNREACH; + if ((rc < 0) && !tipc_msg_reverse(buf, &dnode, -rc)) + return -EHOSTUNREACH; + + tipc_link_xmit(buf, dnode, 0); + return (rc < 0) ? -EHOSTUNREACH : 0; } static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) @@ -1758,6 +1889,7 @@ static int tipc_shutdown(struct socket *sock, int how) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_port *port = &tsk->port; struct sk_buff *buf; + u32 peer; int res; if (how != SHUT_RDWR) @@ -1778,7 +1910,8 @@ restart: goto restart; } tipc_port_disconnect(port->ref); - tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN); + if (tipc_msg_reverse(buf, &peer, TIPC_CONN_SHUTDOWN)) + tipc_link_xmit(buf, peer, 0); } else { tipc_port_shutdown(port->ref); } @@ -1936,7 +2069,7 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, return put_user(sizeof(value), ol); } -int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) +static int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) { struct tipc_sioc_ln_req lnr; void __user *argp = (void __user *)arg; @@ -1952,7 +2085,6 @@ int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) return 0; } return -EADDRNOTAVAIL; - break; default: return -ENOIOCTLCMD; } diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 3afcd2a70b31..43b75b3ceced 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -38,6 +38,9 @@ #include "port.h" #include <net/sock.h> +#define TIPC_CONN_OK 0 +#define TIPC_CONN_PROBING 1 + /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API @@ -45,6 +48,9 @@ * @peer_name: the peer of the connection, if any * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue + * @link_cong: non-zero if owner must sleep because of link congestion + * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @rcv_unacked: # messages read by user, but not yet acked back to peer */ struct tipc_sock { @@ -52,6 +58,9 @@ struct tipc_sock { struct tipc_port port; unsigned int conn_timeout; atomic_t dupl_rcvcnt; + int link_cong; + uint sent_unacked; + uint rcv_unacked; }; static inline struct tipc_sock *tipc_sk(const struct sock *sk) @@ -69,6 +78,13 @@ static inline void tipc_sock_wakeup(struct tipc_sock *tsk) tsk->sk.sk_write_space(&tsk->sk); } +static inline int tipc_sk_conn_cong(struct tipc_sock *tsk) +{ + return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; +} + int tipc_sk_rcv(struct sk_buff *buf); +void tipc_sk_mcast_rcv(struct sk_buff *buf); + #endif |