diff options
Diffstat (limited to 'net')
646 files changed, 31189 insertions, 20935 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig new file mode 100644 index 000000000000..e4a02ef55102 --- /dev/null +++ b/net/6lowpan/Kconfig @@ -0,0 +1,6 @@ +config 6LOWPAN + tristate "6LoWPAN Support" + depends on IPV6 + ---help--- + This enables IPv6 over Low power Wireless Personal Area Network - + "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks. diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile new file mode 100644 index 000000000000..415886bb456a --- /dev/null +++ b/net/6lowpan/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_6LOWPAN) := 6lowpan.o + +6lowpan-y := iphc.o diff --git a/net/ieee802154/6lowpan_iphc.c b/net/6lowpan/iphc.c index 211b5686d719..142eef55c9e2 100644 --- a/net/ieee802154/6lowpan_iphc.c +++ b/net/6lowpan/iphc.c @@ -3,8 +3,7 @@ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ -/* - * Based on patches from Jon Smirl <jonsmirl@gmail.com> +/* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -58,16 +57,15 @@ #include <net/ipv6.h> #include <net/af_ieee802154.h> -/* - * Uncompress address function for source and +/* Uncompress address function for source and * destination address(non-multicast). * * address_mode is sam value or dam value. */ static int uncompress_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, const u8 address_mode, - const u8 *lladdr, const u8 addr_type, - const u8 addr_len) + struct in6_addr *ipaddr, const u8 address_mode, + const u8 *lladdr, const u8 addr_type, + const u8 addr_len) { bool fail; @@ -140,13 +138,12 @@ static int uncompress_addr(struct sk_buff *skb, return 0; } -/* - * Uncompress address function for source context +/* Uncompress address function for source context * based address(non-multicast). 
*/ static int uncompress_context_based_src_addr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 sam) + struct in6_addr *ipaddr, + const u8 sam) { switch (sam) { case LOWPAN_IPHC_ADDR_00: @@ -175,13 +172,13 @@ static int uncompress_context_based_src_addr(struct sk_buff *skb, } static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, - struct net_device *dev, skb_delivery_cb deliver_skb) + struct net_device *dev, skb_delivery_cb deliver_skb) { struct sk_buff *new; int stat; new = skb_copy_expand(skb, sizeof(struct ipv6hdr), skb_tailroom(skb), - GFP_ATOMIC); + GFP_ATOMIC); kfree_skb(skb); if (!new) @@ -196,7 +193,7 @@ static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, new->dev = dev; raw_dump_table(__func__, "raw skb data dump before receiving", - new->data, new->len); + new->data, new->len); stat = deliver_skb(new, dev); @@ -208,10 +205,9 @@ static int skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr, /* Uncompress function for multicast destination address, * when M bit is set. 
*/ -static int -lowpan_uncompress_multicast_daddr(struct sk_buff *skb, - struct in6_addr *ipaddr, - const u8 dam) +static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb, + struct in6_addr *ipaddr, + const u8 dam) { bool fail; @@ -257,41 +253,41 @@ lowpan_uncompress_multicast_daddr(struct sk_buff *skb, } raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is", - ipaddr->s6_addr, 16); + ipaddr->s6_addr, 16); return 0; } -static int -uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) +static int uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) { bool fail; u8 tmp = 0, val = 0; - if (!uh) - goto err; - - fail = lowpan_fetch_skb(skb, &tmp, 1); + fail = lowpan_fetch_skb(skb, &tmp, sizeof(tmp)); if ((tmp & LOWPAN_NHC_UDP_MASK) == LOWPAN_NHC_UDP_ID) { pr_debug("UDP header uncompression\n"); switch (tmp & LOWPAN_NHC_UDP_CS_P_11) { case LOWPAN_NHC_UDP_CS_P_00: - fail |= lowpan_fetch_skb(skb, &uh->source, 2); - fail |= lowpan_fetch_skb(skb, &uh->dest, 2); + fail |= lowpan_fetch_skb(skb, &uh->source, + sizeof(uh->source)); + fail |= lowpan_fetch_skb(skb, &uh->dest, + sizeof(uh->dest)); break; case LOWPAN_NHC_UDP_CS_P_01: - fail |= lowpan_fetch_skb(skb, &uh->source, 2); - fail |= lowpan_fetch_skb(skb, &val, 1); + fail |= lowpan_fetch_skb(skb, &uh->source, + sizeof(uh->source)); + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); uh->dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); break; case LOWPAN_NHC_UDP_CS_P_10: - fail |= lowpan_fetch_skb(skb, &val, 1); + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); uh->source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT); - fail |= lowpan_fetch_skb(skb, &uh->dest, 2); + fail |= lowpan_fetch_skb(skb, &uh->dest, + sizeof(uh->dest)); break; case LOWPAN_NHC_UDP_CS_P_11: - fail |= lowpan_fetch_skb(skb, &val, 1); + fail |= lowpan_fetch_skb(skb, &val, sizeof(val)); uh->source = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val >> 4)); uh->dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + @@ -300,7 +296,6 @@ 
uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) default: pr_debug("ERROR: unknown UDP format\n"); goto err; - break; } pr_debug("uncompressed UDP ports: src = %d, dst = %d\n", @@ -311,11 +306,11 @@ uncompress_udp_header(struct sk_buff *skb, struct udphdr *uh) pr_debug_ratelimited("checksum elided currently not supported\n"); goto err; } else { - fail |= lowpan_fetch_skb(skb, &uh->check, 2); + fail |= lowpan_fetch_skb(skb, &uh->check, + sizeof(uh->check)); } - /* - * UDP lenght needs to be infered from the lower layers + /* UDP length needs to be infered from the lower layers * here, we obtain the hint from the remaining size of the * frame */ @@ -338,21 +333,21 @@ err: static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 }; int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, - const u8 *saddr, const u8 saddr_type, const u8 saddr_len, - const u8 *daddr, const u8 daddr_type, const u8 daddr_len, - u8 iphc0, u8 iphc1, skb_delivery_cb deliver_skb) + const u8 *saddr, const u8 saddr_type, const u8 saddr_len, + const u8 *daddr, const u8 daddr_type, const u8 daddr_len, + u8 iphc0, u8 iphc1, skb_delivery_cb deliver_skb) { struct ipv6hdr hdr = {}; u8 tmp, num_context = 0; int err; raw_dump_table(__func__, "raw skb data dump uncompressed", - skb->data, skb->len); + skb->data, skb->len); /* another if the CID flag is set */ if (iphc1 & LOWPAN_IPHC_CID) { pr_debug("CID flag is set, increase header with one\n"); - if (lowpan_fetch_skb_u8(skb, &num_context)) + if (lowpan_fetch_skb(skb, &num_context, sizeof(num_context))) goto drop; } @@ -360,12 +355,11 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, /* Traffic Class and Flow Label */ switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) { - /* - * Traffic Class and FLow Label carried in-line + /* Traffic Class and FLow Label carried in-line * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes) */ case 0: /* 00b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) + if (lowpan_fetch_skb(skb, &tmp, 
sizeof(tmp))) goto drop; memcpy(&hdr.flow_lbl, &skb->data[0], 3); @@ -374,23 +368,21 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) | (hdr.flow_lbl[0] & 0x0f); break; - /* - * Traffic class carried in-line + /* Traffic class carried in-line * ECN + DSCP (1 byte), Flow Label is elided */ case 2: /* 10b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) + if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) goto drop; hdr.priority = ((tmp >> 2) & 0x0f); hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30); break; - /* - * Flow Label carried in-line + /* Flow Label carried in-line * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided */ case 1: /* 01b */ - if (lowpan_fetch_skb_u8(skb, &tmp)) + if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) goto drop; hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30); @@ -407,7 +399,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, /* Next Header */ if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { /* Next header is carried inline */ - if (lowpan_fetch_skb_u8(skb, &(hdr.nexthdr))) + if (lowpan_fetch_skb(skb, &hdr.nexthdr, sizeof(hdr.nexthdr))) goto drop; pr_debug("NH flag is set, next header carried inline: %02x\n", @@ -415,10 +407,11 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, } /* Hop Limit */ - if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) + if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) { hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03]; - else { - if (lowpan_fetch_skb_u8(skb, &(hdr.hop_limit))) + } else { + if (lowpan_fetch_skb(skb, &hdr.hop_limit, + sizeof(hdr.hop_limit))) goto drop; } @@ -428,13 +421,12 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, if (iphc1 & LOWPAN_IPHC_SAC) { /* Source address context based uncompression */ pr_debug("SAC bit is set. 
Handle context based source address.\n"); - err = uncompress_context_based_src_addr( - skb, &hdr.saddr, tmp); + err = uncompress_context_based_src_addr(skb, &hdr.saddr, tmp); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); err = uncompress_addr(skb, &hdr.saddr, tmp, saddr, - saddr_type, saddr_len); + saddr_type, saddr_len); } /* Check on error of previous branch */ @@ -450,16 +442,17 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, pr_debug("dest: context-based mcast compression\n"); /* TODO: implement this */ } else { - err = lowpan_uncompress_multicast_daddr( - skb, &hdr.daddr, tmp); + err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, + tmp); + if (err) goto drop; } } else { err = uncompress_addr(skb, &hdr.daddr, tmp, daddr, - daddr_type, daddr_len); + daddr_type, daddr_len); pr_debug("dest: stateless compression mode %d dest %pI6c\n", - tmp, &hdr.daddr); + tmp, &hdr.daddr); if (err) goto drop; } @@ -468,11 +461,11 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, if (iphc0 & LOWPAN_IPHC_NH_C) { struct udphdr uh; struct sk_buff *new; + if (uncompress_udp_header(skb, &uh)) goto drop; - /* - * replace the compressed UDP head by the uncompressed UDP + /* replace the compressed UDP head by the uncompressed UDP * header */ new = skb_copy_expand(skb, sizeof(struct udphdr), @@ -489,7 +482,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr)); raw_dump_table(__func__, "raw UDP header dump", - (u8 *)&uh, sizeof(uh)); + (u8 *)&uh, sizeof(uh)); hdr.nexthdr = UIP_PROTO_UDP; } @@ -504,8 +497,7 @@ int lowpan_process_data(struct sk_buff *skb, struct net_device *dev, hdr.version, ntohs(hdr.payload_len), hdr.nexthdr, hdr.hop_limit, &hdr.daddr); - raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, - sizeof(hdr)); + raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, sizeof(hdr)); return 
skb_deliver(skb, &hdr, dev, deliver_skb); @@ -515,9 +507,9 @@ drop: } EXPORT_SYMBOL_GPL(lowpan_process_data); -static u8 lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, - const struct in6_addr *ipaddr, - const unsigned char *lladdr) +static u8 lowpan_compress_addr_64(u8 **hc_ptr, u8 shift, + const struct in6_addr *ipaddr, + const unsigned char *lladdr) { u8 val = 0; @@ -526,24 +518,22 @@ static u8 lowpan_compress_addr_64(u8 **hc06_ptr, u8 shift, pr_debug("address compression 0 bits\n"); } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { /* compress IID to 16 bits xxxx::XXXX */ - memcpy(*hc06_ptr, &ipaddr->s6_addr16[7], 2); - *hc06_ptr += 2; + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2); val = 2; /* 16-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", - *hc06_ptr - 2, 2); + *hc_ptr - 2, 2); } else { /* do not compress IID => xxxx::IID */ - memcpy(*hc06_ptr, &ipaddr->s6_addr16[4], 8); - *hc06_ptr += 8; + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); val = 1; /* 64-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", - *hc06_ptr - 8, 8); + *hc_ptr - 8, 8); } return rol8(val, shift); } -static void compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) +static void compress_udp_header(u8 **hc_ptr, struct sk_buff *skb) { struct udphdr *uh = udp_hdr(skb); u8 tmp; @@ -555,75 +545,75 @@ static void compress_udp_header(u8 **hc06_ptr, struct sk_buff *skb) pr_debug("UDP header: both ports compression to 4 bits\n"); /* compression value */ tmp = LOWPAN_NHC_UDP_CS_P_11; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); /* source and destination port */ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT + ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4); - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) == LOWPAN_NHC_UDP_8BIT_PORT) { pr_debug("UDP header: remove 
8 bits of dest\n"); /* compression value */ tmp = LOWPAN_NHC_UDP_CS_P_01; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); /* source port */ - lowpan_push_hc_data(hc06_ptr, &uh->source, sizeof(uh->source)); + lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); /* destination port */ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) == LOWPAN_NHC_UDP_8BIT_PORT) { pr_debug("UDP header: remove 8 bits of source\n"); /* compression value */ tmp = LOWPAN_NHC_UDP_CS_P_10; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); /* source port */ tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); /* destination port */ - lowpan_push_hc_data(hc06_ptr, &uh->dest, sizeof(uh->dest)); + lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); } else { pr_debug("UDP header: can't compress\n"); /* compression value */ tmp = LOWPAN_NHC_UDP_CS_P_00; - lowpan_push_hc_data(hc06_ptr, &tmp, sizeof(tmp)); + lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp)); /* source port */ - lowpan_push_hc_data(hc06_ptr, &uh->source, sizeof(uh->source)); + lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source)); /* destination port */ - lowpan_push_hc_data(hc06_ptr, &uh->dest, sizeof(uh->dest)); + lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest)); } /* checksum is always inline */ - lowpan_push_hc_data(hc06_ptr, &uh->check, sizeof(uh->check)); + lowpan_push_hc_data(hc_ptr, &uh->check, sizeof(uh->check)); /* skip the UDP header */ skb_pull(skb, sizeof(struct udphdr)); } int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) + 
unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) { - u8 tmp, iphc0, iphc1, *hc06_ptr; + u8 tmp, iphc0, iphc1, *hc_ptr; struct ipv6hdr *hdr; u8 head[100] = {}; + int addr_type; if (type != ETH_P_IPV6) return -EINVAL; hdr = ipv6_hdr(skb); - hc06_ptr = head + 2; + hc_ptr = head + 2; pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n" "\tnexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n", - hdr->version, ntohs(hdr->payload_len), hdr->nexthdr, - hdr->hop_limit, &hdr->daddr); + hdr->version, ntohs(hdr->payload_len), hdr->nexthdr, + hdr->hop_limit, &hdr->daddr); raw_dump_table(__func__, "raw skb network header dump", - skb_network_header(skb), sizeof(struct ipv6hdr)); + skb_network_header(skb), sizeof(struct ipv6hdr)); - /* - * As we copy some bit-length fields, in the IPHC encoding bytes, + /* As we copy some bit-length fields, in the IPHC encoding bytes, * we sometimes use |= * If the field is 0, and the current bit value in memory is 1, * this does not work. We therefore reset the IPHC encoding here @@ -638,49 +628,47 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, raw_dump_inline(__func__, "daddr", (unsigned char *)_daddr, IEEE802154_ADDR_LEN); - raw_dump_table(__func__, - "sending raw skb network uncompressed packet", - skb->data, skb->len); + raw_dump_table(__func__, "sending raw skb network uncompressed packet", + skb->data, skb->len); - /* - * Traffic class, flow label + /* Traffic class, flow label * If flow label is 0, compress it. 
If traffic class is 0, compress it * We have to process both in the same time as the offset of traffic * class depends on the presence of version and flow label */ - /* hc06 format of TC is ECN | DSCP , original one is DSCP | ECN */ + /* hc format of TC is ECN | DSCP , original one is DSCP | ECN */ tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4); tmp = ((tmp & 0x03) << 6) | (tmp >> 2); if (((hdr->flow_lbl[0] & 0x0F) == 0) && - (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { + (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) { /* flow label can be compressed */ iphc0 |= LOWPAN_IPHC_FL_C; if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { + ((hdr->flow_lbl[0] & 0xF0) == 0)) { /* compress (elide) all */ iphc0 |= LOWPAN_IPHC_TC_C; } else { /* compress only the flow label */ - *hc06_ptr = tmp; - hc06_ptr += 1; + *hc_ptr = tmp; + hc_ptr += 1; } } else { /* Flow label cannot be compressed */ if ((hdr->priority == 0) && - ((hdr->flow_lbl[0] & 0xF0) == 0)) { + ((hdr->flow_lbl[0] & 0xF0) == 0)) { /* compress only traffic class */ iphc0 |= LOWPAN_IPHC_TC_C; - *hc06_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); - memcpy(hc06_ptr + 1, &hdr->flow_lbl[1], 2); - hc06_ptr += 3; + *hc_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F); + memcpy(hc_ptr + 1, &hdr->flow_lbl[1], 2); + hc_ptr += 3; } else { /* compress nothing */ - memcpy(hc06_ptr, hdr, 4); + memcpy(hc_ptr, hdr, 4); /* replace the top byte with new ECN | DSCP format */ - *hc06_ptr = tmp; - hc06_ptr += 4; + *hc_ptr = tmp; + hc_ptr += 4; } } @@ -690,13 +678,11 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, if (hdr->nexthdr == UIP_PROTO_UDP) iphc0 |= LOWPAN_IPHC_NH_C; - if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) { - *hc06_ptr = hdr->nexthdr; - hc06_ptr += 1; - } + if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) + lowpan_push_hc_data(&hc_ptr, &hdr->nexthdr, + sizeof(hdr->nexthdr)); - /* - * Hop limit + /* Hop limit * if 1: compress, encoding is 01 * if 64: compress, encoding is 10 * if 
255: compress, encoding is 11 @@ -713,87 +699,89 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, iphc0 |= LOWPAN_IPHC_TTL_255; break; default: - *hc06_ptr = hdr->hop_limit; - hc06_ptr += 1; - break; + lowpan_push_hc_data(&hc_ptr, &hdr->hop_limit, + sizeof(hdr->hop_limit)); } + addr_type = ipv6_addr_type(&hdr->saddr); /* source address compression */ - if (is_addr_unspecified(&hdr->saddr)) { + if (addr_type == IPV6_ADDR_ANY) { pr_debug("source address is unspecified, setting SAC\n"); iphc1 |= LOWPAN_IPHC_SAC; - /* TODO: context lookup */ - } else if (is_addr_link_local(&hdr->saddr)) { - iphc1 |= lowpan_compress_addr_64(&hc06_ptr, - LOWPAN_IPHC_SAM_BIT, &hdr->saddr, _saddr); - pr_debug("source address unicast link-local %pI6c " - "iphc1 0x%02x\n", &hdr->saddr, iphc1); } else { - pr_debug("send the full source address\n"); - memcpy(hc06_ptr, &hdr->saddr.s6_addr16[0], 16); - hc06_ptr += 16; + if (addr_type & IPV6_ADDR_LINKLOCAL) { + iphc1 |= lowpan_compress_addr_64(&hc_ptr, + LOWPAN_IPHC_SAM_BIT, + &hdr->saddr, _saddr); + pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", + &hdr->saddr, iphc1); + } else { + pr_debug("send the full source address\n"); + lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16); + } } + addr_type = ipv6_addr_type(&hdr->daddr); /* destination address compression */ - if (is_addr_mcast(&hdr->daddr)) { + if (addr_type & IPV6_ADDR_MULTICAST) { pr_debug("destination address is multicast: "); iphc1 |= LOWPAN_IPHC_M; if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) { pr_debug("compressed to 1 octet\n"); iphc1 |= LOWPAN_IPHC_DAM_11; /* use last byte */ - *hc06_ptr = hdr->daddr.s6_addr[15]; - hc06_ptr += 1; + lowpan_push_hc_data(&hc_ptr, + &hdr->daddr.s6_addr[15], 1); } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) { pr_debug("compressed to 4 octets\n"); iphc1 |= LOWPAN_IPHC_DAM_10; /* second byte + the last three */ - *hc06_ptr = hdr->daddr.s6_addr[1]; - memcpy(hc06_ptr + 1, 
&hdr->daddr.s6_addr[13], 3); - hc06_ptr += 4; + lowpan_push_hc_data(&hc_ptr, + &hdr->daddr.s6_addr[1], 1); + lowpan_push_hc_data(&hc_ptr, + &hdr->daddr.s6_addr[13], 3); } else if (lowpan_is_mcast_addr_compressable48(&hdr->daddr)) { pr_debug("compressed to 6 octets\n"); iphc1 |= LOWPAN_IPHC_DAM_01; /* second byte + the last five */ - *hc06_ptr = hdr->daddr.s6_addr[1]; - memcpy(hc06_ptr + 1, &hdr->daddr.s6_addr[11], 5); - hc06_ptr += 6; + lowpan_push_hc_data(&hc_ptr, + &hdr->daddr.s6_addr[1], 1); + lowpan_push_hc_data(&hc_ptr, + &hdr->daddr.s6_addr[11], 5); } else { pr_debug("using full address\n"); iphc1 |= LOWPAN_IPHC_DAM_00; - memcpy(hc06_ptr, &hdr->daddr.s6_addr[0], 16); - hc06_ptr += 16; + lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16); } } else { - /* TODO: context lookup */ - if (is_addr_link_local(&hdr->daddr)) { - iphc1 |= lowpan_compress_addr_64(&hc06_ptr, + if (addr_type & IPV6_ADDR_LINKLOCAL) { + /* TODO: context lookup */ + iphc1 |= lowpan_compress_addr_64(&hc_ptr, LOWPAN_IPHC_DAM_BIT, &hdr->daddr, _daddr); pr_debug("dest address unicast link-local %pI6c " - "iphc1 0x%02x\n", &hdr->daddr, iphc1); + "iphc1 0x%02x\n", &hdr->daddr, iphc1); } else { pr_debug("dest address unicast %pI6c\n", &hdr->daddr); - memcpy(hc06_ptr, &hdr->daddr.s6_addr16[0], 16); - hc06_ptr += 16; + lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16); } } /* UDP header compression */ if (hdr->nexthdr == UIP_PROTO_UDP) - compress_udp_header(&hc06_ptr, skb); + compress_udp_header(&hc_ptr, skb); head[0] = iphc0; head[1] = iphc1; skb_pull(skb, sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); - memcpy(skb_push(skb, hc06_ptr - head), head, hc06_ptr - head); + memcpy(skb_push(skb, hc_ptr - head), head, hc_ptr - head); skb_reset_network_header(skb); - pr_debug("header len %d skb %u\n", (int)(hc06_ptr - head), skb->len); + pr_debug("header len %d skb %u\n", (int)(hc_ptr - head), skb->len); raw_dump_table(__func__, "raw skb data dump compressed", - skb->data, skb->len); + 
skb->data, skb->len); return 0; } EXPORT_SYMBOL_GPL(lowpan_header_compress); diff --git a/net/802/fc.c b/net/802/fc.c index 05eea6b98bb8..7c174b6750cd 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -126,6 +126,6 @@ static void fc_setup(struct net_device *dev) */ struct net_device *alloc_fcdev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "fc%d", fc_setup); + return alloc_netdev(sizeof_priv, "fc%d", NET_NAME_UNKNOWN, fc_setup); } EXPORT_SYMBOL(alloc_fcdev); diff --git a/net/802/fddi.c b/net/802/fddi.c index 9cda40661e0d..59e7346f1193 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -207,7 +207,8 @@ static void fddi_setup(struct net_device *dev) */ struct net_device *alloc_fddidev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup); + return alloc_netdev(sizeof_priv, "fddi%d", NET_NAME_UNKNOWN, + fddi_setup); } EXPORT_SYMBOL(alloc_fddidev); diff --git a/net/802/hippi.c b/net/802/hippi.c index 5ff2a718ddca..2e03f8259dd5 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -228,7 +228,8 @@ static void hippi_setup(struct net_device *dev) struct net_device *alloc_hippi_dev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "hip%d", hippi_setup); + return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN, + hippi_setup); } EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 44ebd5c2cd4a..64c6bed4a3d3 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -250,7 +250,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } - new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, vlan_setup); + new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, + NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; @@ -324,23 +325,24 @@ static void vlan_transfer_features(struct net_device *dev, netdev_update_features(vlandev); } -static void __vlan_device_event(struct net_device *dev, unsigned long event) +static int 
__vlan_device_event(struct net_device *dev, unsigned long event) { + int err = 0; + switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); - if (vlan_proc_add_dev(dev) < 0) - pr_warn("failed to change proc name for %s\n", - dev->name); + err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: - if (vlan_proc_add_dev(dev) < 0) - pr_warn("failed to add proc entry for %s\n", dev->name); + err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } + + return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, @@ -355,8 +357,12 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, bool last = false; LIST_HEAD(list); - if (is_vlan_dev(dev)) - __vlan_device_event(dev, event); + if (is_vlan_dev(dev)) { + int err = __vlan_device_event(dev, event); + + if (err) + return notifier_from_errno(err); + } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 75d427763992..90cc2bdd4064 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -112,59 +112,6 @@ __be16 vlan_dev_vlan_proto(const struct net_device *dev) } EXPORT_SYMBOL(vlan_dev_vlan_proto); -static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) -{ - if (skb_cow(skb, skb_headroom(skb)) < 0) { - kfree_skb(skb); - return NULL; - } - - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); - skb->mac_header += VLAN_HLEN; - return skb; -} - -struct sk_buff *vlan_untag(struct sk_buff *skb) -{ - struct vlan_hdr *vhdr; - u16 vlan_tci; - - if (unlikely(vlan_tx_tag_present(skb))) { - /* vlan_tci is already set-up so leave this for another time */ - return skb; - } - - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb)) - goto err_free; - - if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) - goto err_free; - - vhdr = (struct vlan_hdr *) skb->data; - vlan_tci = ntohs(vhdr->h_vlan_TCI); - 
__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); - - skb_pull_rcsum(skb, VLAN_HLEN); - vlan_set_encap_proto(skb, vhdr); - - skb = vlan_reorder_header(skb); - if (unlikely(!skb)) - goto err_free; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb_reset_mac_len(skb); - - return skb; - -err_free: - kfree_skb(skb); - return NULL; -} -EXPORT_SYMBOL(vlan_untag); - - /* * vlan info and vid list */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index dd11f612e03e..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -385,6 +385,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: + case SIOCSHWTSTAMP: + case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_do_ioctl) err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); break; @@ -797,7 +799,8 @@ void vlan_setup(struct net_device *dev) ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN; - dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + netif_keep_dst(dev); dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 1d0e89213a28..ae63cf72a953 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -171,6 +171,8 @@ int vlan_proc_add_dev(struct net_device *vlandev) struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); + if (!strcmp(vlandev->name, name_conf)) + return -EINVAL; vlan->dent = proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR, vn->proc_vlan_dir, &vlandev_fops, vlandev); diff --git a/net/9p/client.c b/net/9p/client.c index 0004cbaac4a4..e86a9bea1d16 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -959,7 +959,6 @@ static int p9_client_version(struct p9_client *c) break; default: return -EINVAL; - break; } if (IS_ERR(req)) diff --git a/net/Kconfig b/net/Kconfig 
index d92afe4204d9..6272420a721b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -6,6 +6,7 @@ menuconfig NET bool "Networking support" select NLATTR select GENERIC_NET_UTILS + select ANON_INODES ---help--- Unless you really know what you are doing, you should say Y here. The reason is that some programs need kernel networking support even @@ -176,10 +177,11 @@ config NETFILTER_ADVANCED If unsure, say Y. config BRIDGE_NETFILTER - bool "Bridged IP/ARP packets filtering" - depends on BRIDGE && NETFILTER && INET + tristate "Bridged IP/ARP packets filtering" + depends on BRIDGE + depends on NETFILTER && INET depends on NETFILTER_ADVANCED - default y + default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably @@ -214,6 +216,7 @@ source "drivers/net/appletalk/Kconfig" source "net/x25/Kconfig" source "net/lapb/Kconfig" source "net/phonet/Kconfig" +source "net/6lowpan/Kconfig" source "net/ieee802154/Kconfig" source "net/mac802154/Kconfig" source "net/sched/Kconfig" diff --git a/net/Makefile b/net/Makefile index cbbbe6d657ca..7ed1970074b0 100644 --- a/net/Makefile +++ b/net/Makefile @@ -57,7 +57,8 @@ obj-$(CONFIG_CAIF) += caif/ ifneq ($(CONFIG_DCB),) obj-y += dcb/ endif -obj-y += ieee802154/ +obj-$(CONFIG_6LOWPAN) += 6lowpan/ +obj-$(CONFIG_IEEE802154) += ieee802154/ obj-$(CONFIG_MAC802154) += mac802154/ ifeq ($(CONFIG_NET),y) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index bfcf6be1d665..c00897f65a31 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1805,7 +1805,7 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) long amount = 0; if (skb) - amount = skb->len - sizeof(struct ddpehdr); + amount = skb->len - sizeof(struct ddpehdr); rc = put_user(amount, (int __user *)argp); break; } diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c index 6c8016f61866..e4158b8b926d 100644 --- a/net/appletalk/dev.c +++ 
b/net/appletalk/dev.c @@ -39,6 +39,7 @@ static void ltalk_setup(struct net_device *dev) struct net_device *alloc_ltalkdev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup); + return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN, + ltalk_setup); } EXPORT_SYMBOL(alloc_ltalkdev); diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 403e71fa88fe..cc78538d163b 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -682,8 +682,8 @@ static int br2684_create(void __user *arg) netdev = alloc_netdev(sizeof(struct br2684_dev), ni.ifname[0] ? ni.ifname : "nas%d", - (payload == p_routed) ? - br2684_setup_routed : br2684_setup); + NET_NAME_UNKNOWN, + (payload == p_routed) ? br2684_setup_routed : br2684_setup); if (!netdev) return -ENOMEM; diff --git a/net/atm/clip.c b/net/atm/clip.c index ba291ce4bdff..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -384,7 +384,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ if (old) { - pr_warning("XOFF->XOFF transition\n"); + pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; @@ -447,7 +447,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) struct rtable *rt; if (vcc->push != clip_push) { - pr_warning("non-CLIP VCC\n"); + pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); @@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. 
*/ - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int clip_create(int number) @@ -520,7 +520,8 @@ static int clip_create(int number) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } - dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup); + dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, + clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); diff --git a/net/atm/common.c b/net/atm/common.c index 7b491006eaf4..6a765156a3f6 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -300,7 +300,7 @@ static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) max_sdu = ATM_MAX_AAL34_PDU; break; default: - pr_warning("AAL problems ... (%d)\n", aal); + pr_warn("AAL problems ... (%d)\n", aal); /* fall through */ case ATM_AAL5: max_sdu = ATM_MAX_AAL5_PDU; diff --git a/net/atm/lec.c b/net/atm/lec.c index 4c5b8ba0f84f..4b98f897044a 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -410,9 +410,11 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; + rtnl_lock(); if (dev_set_mtu(dev, mesg->content.config.mtu)) pr_info("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); + rtnl_unlock(); priv->is_proxy = mesg->content.config.is_proxy; break; case l_flush_tran_id: @@ -833,7 +835,6 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, loff_t *l) { struct hlist_node *e = state->node; - struct lec_arp_table *tmp; if (!e) e = tbl->first; @@ -842,9 +843,7 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, --*l; } - tmp = container_of(e, struct lec_arp_table, next); - - hlist_for_each_entry_from(tmp, next) { + for (; e; e = e->next) { if (--*l < 0) break; } diff --git a/net/atm/mpc.c b/net/atm/mpc.c index e8e0e7a8a23d..0e982222d425 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -599,7 +599,7 @@ static netdev_tx_t mpc_send_packet(struct 
sk_buff *skb, } non_ip: - return mpc->old_ops->ndo_start_xmit(skb, dev); + return __netdev_start_xmit(mpc->old_ops, skb, dev, false); } static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) diff --git a/net/atm/svc.c b/net/atm/svc.c index d8e5d0c2ebbc..1ba23f5018e7 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -50,12 +50,12 @@ static void svc_disconnect(struct atm_vcc *vcc) pr_debug("%p\n", vcc); if (test_bit(ATM_VF_REGIS, &vcc->flags)) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_close, NULL, NULL, NULL); - while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { + for (;;) { + prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) + break; schedule(); - prepare_to_wait(sk_sleep(sk), &wait, - TASK_UNINTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); } @@ -126,11 +126,12 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, } vcc->local = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ @@ -202,15 +203,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, } vcc->remote = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); if (flags & O_NONBLOCK) { - finish_wait(sk_sleep(sk), &wait); sock->state = SS_CONNECTING; error = -EINPROGRESS; goto out; } error = 0; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { schedule(); if 
(!signal_pending(current)) { @@ -297,11 +297,12 @@ static int svc_listen(struct socket *sock, int backlog) goto out; } set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { @@ -387,15 +388,15 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags) } /* wait should be short, so we ignore the non-blocking flag */ set_bit(ATM_VF_WAITING, &new_vcc->flags); - prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, - TASK_UNINTERRUPTIBLE); sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); - while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) { + for (;;) { + prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, + TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd) + break; release_sock(sk); schedule(); lock_sock(sk); - prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, - TASK_UNINTERRUPTIBLE); } finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); if (!sigd) { @@ -433,12 +434,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) DEFINE_WAIT(wait); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && - !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || + test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) { + break; + } + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) @@ -529,18 +532,18 @@ static int svc_addparty(struct socket *sock, struct sockaddr 
*sockaddr, lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq(vcc, as_addparty, NULL, NULL, (struct sockaddr_atmsvc *) sockaddr); if (flags & O_NONBLOCK) { - finish_wait(sk_sleep(sk), &wait); error = -EINPROGRESS; goto out; } pr_debug("added wait queue\n"); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); error = xchg(&sk->sk_err_soft, 0); @@ -558,11 +561,12 @@ static int svc_dropparty(struct socket *sock, int ep_ref) lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); - while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { - schedule(); + for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) + break; + schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f04224c32005..1e8053976e83 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -108,14 +108,15 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, int max_if_num) { void *data_ptr; - size_t data_size, old_size; + size_t old_size; int ret = -ENOMEM; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); - data_size = max_if_num * sizeof(unsigned long) * BATADV_NUM_WORDS; old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc(data_size, GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, + BATADV_NUM_WORDS * sizeof(unsigned long), + GFP_ATOMIC); if (!data_ptr) goto unlock; @@ -123,7 +124,7 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, 
kfree(orig_node->bat_iv.bcast_own); orig_node->bat_iv.bcast_own = data_ptr; - data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; @@ -164,7 +165,7 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, goto free_bcast_own; chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS; - data_ptr = kmalloc(max_if_num * chunk_size, GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC); if (!data_ptr) goto unlock; @@ -183,7 +184,7 @@ free_bcast_own: if (max_if_num == 0) goto free_own_sum; - data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index f2c066b21716..b5981113c9a7 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -537,7 +537,8 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) if (!bat_priv->orig_hash) return NULL; - res = kmalloc(BATADV_DAT_CANDIDATES_NUM * sizeof(*res), GFP_ATOMIC); + res = kmalloc_array(BATADV_DAT_CANDIDATES_NUM, sizeof(*res), + GFP_ATOMIC); if (!res) return NULL; diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index f14e54a05691..fc1835c6bb40 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -128,6 +128,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, { struct batadv_frag_table_entry *chain; struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr; + struct batadv_frag_list_entry *frag_entry_last = NULL; struct batadv_frag_packet *frag_packet; uint8_t bucket; uint16_t seqno, hdr_size = sizeof(struct batadv_frag_packet); @@ -180,11 +181,14 @@ static 
bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, ret = true; goto out; } + + /* store current entry because it could be the last in list */ + frag_entry_last = frag_entry_curr; } - /* Reached the end of the list, so insert after 'frag_entry_curr'. */ - if (likely(frag_entry_curr)) { - hlist_add_after(&frag_entry_curr->list, &frag_entry_new->list); + /* Reached the end of the list, so insert after 'frag_entry_last'. */ + if (likely(frag_entry_last)) { + hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 63bdf7e94f1e..7c1c63080e20 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -46,12 +46,12 @@ struct batadv_hashtable *batadv_hash_new(uint32_t size) if (!hash) return NULL; - hash->table = kmalloc(sizeof(*hash->table) * size, GFP_ATOMIC); + hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC); if (!hash->table) goto free_hash; - hash->list_locks = kmalloc(sizeof(*hash->list_locks) * size, - GFP_ATOMIC); + hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks), + GFP_ATOMIC); if (!hash->list_locks) goto free_table; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 118b990bae25..a1fcd884f0b1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2014.3.0" +#define BATADV_SOURCE_VERSION "2014.4.0" #endif /* B.A.T.M.A.N. parameters */ @@ -238,21 +238,29 @@ enum batadv_dbg_level { int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); -#define batadv_dbg(type, bat_priv, fmt, arg...) \ +/* possibly ratelimited debug output */ +#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) 
\ do { \ - if (atomic_read(&bat_priv->log_level) & type) \ + if (atomic_read(&bat_priv->log_level) & type && \ + (!ratelimited || net_ratelimit())) \ batadv_debug_log(bat_priv, fmt, ## arg);\ } \ while (0) #else /* !CONFIG_BATMAN_ADV_DEBUG */ -__printf(3, 4) -static inline void batadv_dbg(int type __always_unused, - struct batadv_priv *bat_priv __always_unused, - const char *fmt __always_unused, ...) +__printf(4, 5) +static inline void _batadv_dbg(int type __always_unused, + struct batadv_priv *bat_priv __always_unused, + int ratelimited __always_unused, + const char *fmt __always_unused, ...) { } #endif +#define batadv_dbg(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 0, ## arg) +#define batadv_dbg_ratelimited(type, bat_priv, arg...) \ + _batadv_dbg(type, bat_priv, 1, ## arg) + #define batadv_info(net_dev, fmt, arg...) \ do { \ struct net_device *_netdev = (net_dev); \ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 96b66fd30f96..ab6bb2af1d45 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -20,7 +20,6 @@ #include "originator.h" #include "hard-interface.h" #include "translation-table.h" -#include "multicast.h" /** * batadv_mcast_mla_softif_get - get softif multicast listeners diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 35141534938e..35f76f2f7824 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -706,11 +706,11 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (batadv_tt_local_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) { if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, ethhdr->h_dest, vid)) - net_ratelimited_function(batadv_dbg, BATADV_DBG_TT, - bat_priv, - "Rerouting unicast packet to %pM (dst=%pM): Local Roaming\n", - unicast_packet->dest, - ethhdr->h_dest); + batadv_dbg_ratelimited(BATADV_DBG_TT, + bat_priv, + "Rerouting unicast packet to %pM (dst=%pM): Local Roaming\n", + unicast_packet->dest, + 
ethhdr->h_dest); /* at this point the mesh destination should have been * substituted with the originator address found in the global * table. If not, let the packet go untouched anyway because @@ -752,10 +752,10 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, */ if (batadv_reroute_unicast_packet(bat_priv, unicast_packet, ethhdr->h_dest, vid)) { - net_ratelimited_function(batadv_dbg, BATADV_DBG_TT, bat_priv, - "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", - unicast_packet->dest, ethhdr->h_dest, - old_ttvn, curr_ttvn); + batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv, + "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", + unicast_packet->dest, ethhdr->h_dest, + old_ttvn, curr_ttvn); return 1; } diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index cbd677f48c00..5467955eb27c 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -751,7 +751,7 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); - atomic_set(&bat_priv->hop_penalty, 15); + atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif @@ -927,7 +927,7 @@ struct net_device *batadv_softif_create(const char *name) int ret; soft_iface = alloc_netdev(sizeof(struct batadv_priv), name, - batadv_softif_init_early); + NET_NAME_UNKNOWN, batadv_softif_init_early); if (!soft_iface) return NULL; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fc47baa888c5..f40cb0436eba 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -900,32 +900,24 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, bat_kobj = &bat_priv->soft_iface->dev.kobj; - uevent_env[0] = kmalloc(strlen(BATADV_UEV_TYPE_VAR) + - 
strlen(batadv_uev_type_str[type]) + 1, - GFP_ATOMIC); + uevent_env[0] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_TYPE_VAR, + batadv_uev_type_str[type]); if (!uevent_env[0]) goto out; - sprintf(uevent_env[0], "%s%s", BATADV_UEV_TYPE_VAR, - batadv_uev_type_str[type]); - - uevent_env[1] = kmalloc(strlen(BATADV_UEV_ACTION_VAR) + - strlen(batadv_uev_action_str[action]) + 1, - GFP_ATOMIC); + uevent_env[1] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_ACTION_VAR, + batadv_uev_action_str[action]); if (!uevent_env[1]) goto out; - sprintf(uevent_env[1], "%s%s", BATADV_UEV_ACTION_VAR, - batadv_uev_action_str[action]); - /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { - uevent_env[2] = kmalloc(strlen(BATADV_UEV_DATA_VAR) + - strlen(data) + 1, GFP_ATOMIC); + uevent_env[2] = kasprintf(GFP_ATOMIC, + "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) goto out; - - sprintf(uevent_env[2], "%s%s", BATADV_UEV_DATA_VAR, data); } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 8796ffa08b43..c2e0d14433df 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2013 Intel Corp. + Copyright (c) 2013-2014 Intel Corp. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 and @@ -14,6 +14,8 @@ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/debugfs.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -25,16 +27,21 @@ #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> -#include "6lowpan.h" - #include <net/6lowpan.h> /* for the compression support */ +#define VERSION "0.1" + +static struct dentry *lowpan_psm_debugfs; +static struct dentry *lowpan_control_debugfs; + #define IFACE_NAME_TEMPLATE "bt%d" #define EUI64_ADDR_LEN 8 struct skb_cb { struct in6_addr addr; - struct l2cap_conn *conn; + struct in6_addr gw; + struct l2cap_chan *chan; + int status; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) @@ -48,9 +55,19 @@ struct skb_cb { static LIST_HEAD(bt_6lowpan_devices); static DEFINE_RWLOCK(devices_lock); +/* If psm is set to 0 (default value), then 6lowpan is disabled. + * Other values are used to indicate a Protocol Service Multiplexer + * value for 6lowpan. 
+ */ +static u16 psm_6lowpan; + +/* We are listening incoming connections via this channel + */ +static struct l2cap_chan *listen_chan; + struct lowpan_peer { struct list_head list; - struct l2cap_conn *conn; + struct l2cap_chan *chan; /* peer addresses in various formats */ unsigned char eui64_addr[EUI64_ADDR_LEN]; @@ -84,6 +101,8 @@ static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) { list_del(&peer->list); + module_put(THIS_MODULE); + if (atomic_dec_and_test(&dev->peer_count)) { BT_DBG("last peer"); return true; @@ -101,13 +120,26 @@ static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, ba, type); list_for_each_entry_safe(peer, tmp, &dev->peers, list) { - BT_DBG("addr %pMR type %d", - &peer->conn->hcon->dst, peer->conn->hcon->dst_type); + BT_DBG("dst addr %pMR dst type %d", + &peer->chan->dst, peer->chan->dst_type); - if (bacmp(&peer->conn->hcon->dst, ba)) + if (bacmp(&peer->chan->dst, ba)) continue; - if (type == peer->conn->hcon->dst_type) + if (type == peer->chan->dst_type) + return peer; + } + + return NULL; +} + +static inline struct lowpan_peer *peer_lookup_chan(struct lowpan_dev *dev, + struct l2cap_chan *chan) +{ + struct lowpan_peer *peer, *tmp; + + list_for_each_entry_safe(peer, tmp, &dev->peers, list) { + if (peer->chan == chan) return peer; } @@ -120,7 +152,55 @@ static inline struct lowpan_peer *peer_lookup_conn(struct lowpan_dev *dev, struct lowpan_peer *peer, *tmp; list_for_each_entry_safe(peer, tmp, &dev->peers, list) { - if (peer->conn == conn) + if (peer->chan->conn == conn) + return peer; + } + + return NULL; +} + +static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, + struct in6_addr *daddr, + struct sk_buff *skb) +{ + struct lowpan_peer *peer, *tmp; + struct in6_addr *nexthop; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + int count = atomic_read(&dev->peer_count); + + BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt); + + /* If we have multiple 6lowpan peers, 
then check where we should + * send the packet. If only one peer exists, then we can send the + * packet right away. + */ + if (count == 1) + return list_first_entry(&dev->peers, struct lowpan_peer, + list); + + if (!rt) { + nexthop = &lowpan_cb(skb)->gw; + + if (ipv6_addr_any(nexthop)) + return NULL; + } else { + nexthop = rt6_nexthop(rt); + + /* We need to remember the address because it is needed + * by bt_xmit() when sending the packet. In bt_xmit(), the + * destination routing info is not set. + */ + memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr)); + } + + BT_DBG("gw %pI6c", nexthop); + + list_for_each_entry_safe(peer, tmp, &dev->peers, list) { + BT_DBG("dst addr %pMR dst type %d ip %pI6c", + &peer->chan->dst, peer->chan->dst_type, + &peer->peer_addr); + + if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) return peer; } @@ -176,16 +256,16 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) return -ENOMEM; ret = netif_rx(skb_cp); - - BT_DBG("receive skb %d", ret); - if (ret < 0) + if (ret < 0) { + BT_DBG("receive skb %d", ret); return NET_RX_DROP; + } return ret; } static int process_data(struct sk_buff *skb, struct net_device *netdev, - struct l2cap_conn *conn) + struct l2cap_chan *chan) { const u8 *saddr, *daddr; u8 iphc0, iphc1; @@ -196,7 +276,7 @@ static int process_data(struct sk_buff *skb, struct net_device *netdev, dev = lowpan_dev(netdev); read_lock_irqsave(&devices_lock, flags); - peer = peer_lookup_conn(dev, conn); + peer = peer_lookup_chan(dev, chan); read_unlock_irqrestore(&devices_lock, flags); if (!peer) goto drop; @@ -225,7 +305,7 @@ drop: } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, - struct l2cap_conn *conn) + struct l2cap_chan *chan) { struct sk_buff *local_skb; int ret; @@ -269,7 +349,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, if (!local_skb) goto drop; - ret = process_data(local_skb, dev, conn); + ret = process_data(local_skb, dev, chan); if (ret != 
NET_RX_SUCCESS) goto drop; @@ -286,147 +366,39 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, return NET_RX_SUCCESS; drop: + dev->stats.rx_dropped++; kfree_skb(skb); return NET_RX_DROP; } /* Packet from BT LE device */ -int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb) +static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { struct lowpan_dev *dev; struct lowpan_peer *peer; int err; - peer = lookup_peer(conn); + peer = lookup_peer(chan->conn); if (!peer) return -ENOENT; - dev = lookup_dev(conn); + dev = lookup_dev(chan->conn); if (!dev || !dev->netdev) return -ENOENT; - err = recv_pkt(skb, dev->netdev, conn); - BT_DBG("recv pkt %d", err); - - return err; -} - -static inline int skbuff_copy(void *msg, int len, int count, int mtu, - struct sk_buff *skb, struct net_device *dev) -{ - struct sk_buff **frag; - int sent = 0; - - memcpy(skb_put(skb, count), msg, count); - - sent += count; - msg += count; - len -= count; - - dev->stats.tx_bytes += count; - dev->stats.tx_packets++; - - raw_dump_table(__func__, "Sending", skb->data, skb->len); - - /* Continuation fragments (no L2CAP header) */ - frag = &skb_shinfo(skb)->frag_list; - while (len > 0) { - struct sk_buff *tmp; - - count = min_t(unsigned int, mtu, len); - - tmp = bt_skb_alloc(count, GFP_ATOMIC); - if (!tmp) - return -ENOMEM; - - *frag = tmp; - - memcpy(skb_put(*frag, count), msg, count); - - raw_dump_table(__func__, "Sending fragment", - (*frag)->data, count); - - (*frag)->priority = skb->priority; - - sent += count; - msg += count; - len -= count; - - skb->len += (*frag)->len; - skb->data_len += (*frag)->len; - - frag = &(*frag)->next; - - dev->stats.tx_bytes += count; - dev->stats.tx_packets++; - } - - return sent; -} - -static struct sk_buff *create_pdu(struct l2cap_conn *conn, void *msg, - size_t len, u32 priority, - struct net_device *dev) -{ - struct sk_buff *skb; - int err, count; - struct l2cap_hdr *lh; - - /* FIXME: This mtu check should be not needed 
and atm is only used for - * testing purposes - */ - if (conn->mtu > (L2CAP_LE_MIN_MTU + L2CAP_HDR_SIZE)) - conn->mtu = L2CAP_LE_MIN_MTU + L2CAP_HDR_SIZE; - - count = min_t(unsigned int, (conn->mtu - L2CAP_HDR_SIZE), len); - - BT_DBG("conn %p len %zu mtu %d count %d", conn, len, conn->mtu, count); - - skb = bt_skb_alloc(count + L2CAP_HDR_SIZE, GFP_ATOMIC); - if (!skb) - return ERR_PTR(-ENOMEM); - - skb->priority = priority; - - lh = (struct l2cap_hdr *)skb_put(skb, L2CAP_HDR_SIZE); - lh->cid = cpu_to_le16(L2CAP_FC_6LOWPAN); - lh->len = cpu_to_le16(len); - - err = skbuff_copy(msg, len, count, conn->mtu, skb, dev); - if (unlikely(err < 0)) { - kfree_skb(skb); - BT_DBG("skbuff copy %d failed", err); - return ERR_PTR(err); + err = recv_pkt(skb, dev->netdev, chan); + if (err) { + BT_DBG("recv pkt %d", err); + err = -EAGAIN; } - return skb; -} - -static int conn_send(struct l2cap_conn *conn, - void *msg, size_t len, u32 priority, - struct net_device *dev) -{ - struct sk_buff *skb; - - skb = create_pdu(conn, msg, len, priority, dev); - if (IS_ERR(skb)) - return -EINVAL; - - BT_DBG("conn %p skb %p len %d priority %u", conn, skb, skb->len, - skb->priority); - - hci_send_acl(conn->hchan, skb, ACL_START); - - return 0; + return err; } static u8 get_addr_type_from_eui64(u8 byte) { - /* Is universal(0) or local(1) bit, */ - if (byte & 0x02) - return ADDR_LE_DEV_RANDOM; - - return ADDR_LE_DEV_PUBLIC; + /* Is universal(0) or local(1) bit */ + return ((byte & 0x02) ? 
BDADDR_LE_RANDOM : BDADDR_LE_PUBLIC); } static void copy_to_bdaddr(struct in6_addr *ip6_daddr, bdaddr_t *addr) @@ -454,77 +426,132 @@ static void convert_dest_bdaddr(struct in6_addr *ip6_daddr, *addr_type = get_addr_type_from_eui64(addr->b[5]); } -static int header_create(struct sk_buff *skb, struct net_device *netdev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) +static int setup_header(struct sk_buff *skb, struct net_device *netdev, + bdaddr_t *peer_addr, u8 *peer_addr_type) { - struct ipv6hdr *hdr; + struct in6_addr ipv6_daddr; struct lowpan_dev *dev; struct lowpan_peer *peer; bdaddr_t addr, *any = BDADDR_ANY; - u8 *saddr, *daddr = any->b; - u8 addr_type; - - if (type != ETH_P_IPV6) - return -EINVAL; - - hdr = ipv6_hdr(skb); + u8 *daddr = any->b; + int err, status = 0; dev = lowpan_dev(netdev); - if (ipv6_addr_is_multicast(&hdr->daddr)) { - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, - sizeof(struct in6_addr)); - lowpan_cb(skb)->conn = NULL; + memcpy(&ipv6_daddr, &lowpan_cb(skb)->addr, sizeof(ipv6_daddr)); + + if (ipv6_addr_is_multicast(&ipv6_daddr)) { + lowpan_cb(skb)->chan = NULL; } else { unsigned long flags; + u8 addr_type; /* Get destination BT device from skb. * If there is no such peer then discard the packet. */ - convert_dest_bdaddr(&hdr->daddr, &addr, &addr_type); + convert_dest_bdaddr(&ipv6_daddr, &addr, &addr_type); - BT_DBG("dest addr %pMR type %s IP %pI6c", &addr, - addr_type == ADDR_LE_DEV_PUBLIC ? "PUBLIC" : "RANDOM", - &hdr->daddr); + BT_DBG("dest addr %pMR type %d IP %pI6c", &addr, + addr_type, &ipv6_daddr); read_lock_irqsave(&devices_lock, flags); peer = peer_lookup_ba(dev, &addr, addr_type); read_unlock_irqrestore(&devices_lock, flags); if (!peer) { - BT_DBG("no such peer %pMR found", &addr); - return -ENOENT; + /* The packet might be sent to 6lowpan interface + * because of routing (either via default route + * or user set route) so get peer according to + * the destination address. 
+ */ + read_lock_irqsave(&devices_lock, flags); + peer = peer_lookup_dst(dev, &ipv6_daddr, skb); + read_unlock_irqrestore(&devices_lock, flags); + if (!peer) { + BT_DBG("no such peer %pMR found", &addr); + return -ENOENT; + } } daddr = peer->eui64_addr; + *peer_addr = addr; + *peer_addr_type = addr_type; + lowpan_cb(skb)->chan = peer->chan; - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, - sizeof(struct in6_addr)); - lowpan_cb(skb)->conn = peer->conn; + status = 1; } - saddr = dev->netdev->dev_addr; + lowpan_header_compress(skb, netdev, ETH_P_IPV6, daddr, + dev->netdev->dev_addr, skb->len); + + err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0); + if (err < 0) + return err; + + return status; +} + +static int header_create(struct sk_buff *skb, struct net_device *netdev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) +{ + struct ipv6hdr *hdr; + + if (type != ETH_P_IPV6) + return -EINVAL; - return lowpan_header_compress(skb, netdev, type, daddr, saddr, len); + hdr = ipv6_hdr(skb); + + memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, sizeof(struct in6_addr)); + + return 0; } /* Packet to BT LE device */ -static int send_pkt(struct l2cap_conn *conn, const void *saddr, - const void *daddr, struct sk_buff *skb, +static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, struct net_device *netdev) { - raw_dump_table(__func__, "raw skb data dump before fragmentation", - skb->data, skb->len); + struct msghdr msg; + struct kvec iv; + int err; + + /* Remember the skb so that we can send EAGAIN to the caller if + * we run out of credits. 
+ */ + chan->data = skb; + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = (struct iovec *) &iv; + msg.msg_iovlen = 1; + iv.iov_base = skb->data; + iv.iov_len = skb->len; + + err = l2cap_chan_send(chan, &msg, skb->len); + if (err > 0) { + netdev->stats.tx_bytes += err; + netdev->stats.tx_packets++; + return 0; + } + + if (!err) + err = lowpan_cb(skb)->status; + + if (err < 0) { + if (err == -EAGAIN) + netdev->stats.tx_dropped++; + else + netdev->stats.tx_errors++; + } - return conn_send(conn, skb->data, skb->len, 0, netdev); + return err; } -static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) +static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; struct lowpan_dev *entry, *tmp; unsigned long flags; + int err = 0; read_lock_irqsave(&devices_lock, flags); @@ -538,58 +565,77 @@ static void send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) dev = lowpan_dev(entry->netdev); list_for_each_entry_safe(pentry, ptmp, &dev->peers, list) { + int ret; + local_skb = skb_clone(skb, GFP_ATOMIC); - send_pkt(pentry->conn, netdev->dev_addr, - pentry->eui64_addr, local_skb, netdev); + BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + netdev->name, + &pentry->chan->dst, pentry->chan->dst_type, + &pentry->peer_addr, pentry->chan); + ret = send_pkt(pentry->chan, local_skb, netdev); + if (ret < 0) + err = ret; kfree_skb(local_skb); } } read_unlock_irqrestore(&devices_lock, flags); + + return err; } static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) { int err = 0; - unsigned char *eui64_addr; - struct lowpan_dev *dev; - struct lowpan_peer *peer; bdaddr_t addr; u8 addr_type; + struct sk_buff *tmpskb; - if (ipv6_addr_is_multicast(&lowpan_cb(skb)->addr)) { - /* We need to send the packet to every device - * behind this interface. 
- */ - send_mcast_pkt(skb, netdev); - } else { - unsigned long flags; - - convert_dest_bdaddr(&lowpan_cb(skb)->addr, &addr, &addr_type); - eui64_addr = lowpan_cb(skb)->addr.s6_addr + 8; - dev = lowpan_dev(netdev); - - read_lock_irqsave(&devices_lock, flags); - peer = peer_lookup_ba(dev, &addr, addr_type); - read_unlock_irqrestore(&devices_lock, flags); + /* We must take a copy of the skb before we modify/replace the ipv6 + * header as the header could be used elsewhere + */ + tmpskb = skb_unshare(skb, GFP_ATOMIC); + if (!tmpskb) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + skb = tmpskb; - BT_DBG("xmit %s to %pMR type %s IP %pI6c peer %p", - netdev->name, &addr, - addr_type == ADDR_LE_DEV_PUBLIC ? "PUBLIC" : "RANDOM", - &lowpan_cb(skb)->addr, peer); + /* Return values from setup_header() + * <0 - error, packet is dropped + * 0 - this is a multicast packet + * 1 - this is unicast packet + */ + err = setup_header(skb, netdev, &addr, &addr_type); + if (err < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } - if (peer && peer->conn) - err = send_pkt(peer->conn, netdev->dev_addr, - eui64_addr, skb, netdev); + if (err) { + if (lowpan_cb(skb)->chan) { + BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p", + netdev->name, &addr, addr_type, + &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan); + err = send_pkt(lowpan_cb(skb)->chan, skb, netdev); + } else { + err = -ENOENT; + } + } else { + /* We need to send the packet to every device behind this + * interface. + */ + err = send_mcast_pkt(skb, netdev); } + dev_kfree_skb(skb); if (err) BT_DBG("ERROR: xmit failed (%d)", err); - return (err < 0) ? NET_XMIT_DROP : err; + return err < 0 ? 
NET_XMIT_DROP : err; } static const struct net_device_ops netdev_ops = { @@ -609,7 +655,8 @@ static void netdev_setup(struct net_device *dev) dev->needed_tailroom = 0; dev->mtu = IPV6_MIN_MTU; dev->tx_queue_len = 0; - dev->flags = IFF_RUNNING | IFF_POINTOPOINT; + dev->flags = IFF_RUNNING | IFF_POINTOPOINT | + IFF_MULTICAST; dev->watchdog_timeo = 0; dev->netdev_ops = &netdev_ops; @@ -634,7 +681,7 @@ static void set_addr(u8 *eui, u8 *addr, u8 addr_type) eui[7] = addr[0]; /* Universal/local bit set, BT 6lowpan draft ch. 3.2.1 */ - if (addr_type == ADDR_LE_DEV_PUBLIC) + if (addr_type == BDADDR_LE_PUBLIC) eui[0] &= ~0x02; else eui[0] |= 0x02; @@ -660,6 +707,17 @@ static void ifup(struct net_device *netdev) rtnl_unlock(); } +static void ifdown(struct net_device *netdev) +{ + int err; + + rtnl_lock(); + err = dev_close(netdev); + if (err < 0) + BT_INFO("iface %s cannot be closed (%d)", netdev->name, err); + rtnl_unlock(); +} + static void do_notify_peers(struct work_struct *work) { struct lowpan_dev *dev = container_of(work, struct lowpan_dev, @@ -673,30 +731,81 @@ static bool is_bt_6lowpan(struct hci_conn *hcon) if (hcon->type != LE_LINK) return false; - return test_bit(HCI_CONN_6LOWPAN, &hcon->flags); + if (!psm_6lowpan) + return false; + + return true; +} + +static struct l2cap_chan *chan_create(void) +{ + struct l2cap_chan *chan; + + chan = l2cap_chan_create(); + if (!chan) + return NULL; + + l2cap_chan_set_defaults(chan); + + chan->chan_type = L2CAP_CHAN_CONN_ORIENTED; + chan->mode = L2CAP_MODE_LE_FLOWCTL; + chan->omtu = 65535; + chan->imtu = chan->omtu; + + return chan; +} + +static struct l2cap_chan *chan_open(struct l2cap_chan *pchan) +{ + struct l2cap_chan *chan; + + chan = chan_create(); + if (!chan) + return NULL; + + chan->remote_mps = chan->omtu; + chan->mps = chan->omtu; + + chan->state = BT_CONNECTED; + + return chan; } -static int add_peer_conn(struct l2cap_conn *conn, struct lowpan_dev *dev) +static void set_ip_addr_bits(u8 addr_type, u8 *addr) +{ + if 
(addr_type == BDADDR_LE_PUBLIC) + *addr |= 0x02; + else + *addr &= ~0x02; +} + +static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, + struct lowpan_dev *dev) { struct lowpan_peer *peer; unsigned long flags; peer = kzalloc(sizeof(*peer), GFP_ATOMIC); if (!peer) - return -ENOMEM; + return NULL; - peer->conn = conn; + peer->chan = chan; memset(&peer->peer_addr, 0, sizeof(struct in6_addr)); /* RFC 2464 ch. 5 */ peer->peer_addr.s6_addr[0] = 0xFE; peer->peer_addr.s6_addr[1] = 0x80; - set_addr((u8 *)&peer->peer_addr.s6_addr + 8, conn->hcon->dst.b, - conn->hcon->dst_type); + set_addr((u8 *)&peer->peer_addr.s6_addr + 8, chan->dst.b, + chan->dst_type); memcpy(&peer->eui64_addr, (u8 *)&peer->peer_addr.s6_addr + 8, EUI64_ADDR_LEN); + /* IPv6 address needs to have the U/L bit set properly so toggle + * it back here. + */ + set_ip_addr_bits(chan->dst_type, (u8 *)&peer->peer_addr.s6_addr + 8); + write_lock_irqsave(&devices_lock, flags); INIT_LIST_HEAD(&peer->list); peer_add(dev, peer); @@ -706,40 +815,24 @@ static int add_peer_conn(struct l2cap_conn *conn, struct lowpan_dev *dev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); - return 0; + return peer->chan; } -/* This gets called when BT LE 6LoWPAN device is connected. We then - * create network device that acts as a proxy between BT LE device - * and kernel network stack. 
- */ -int bt_6lowpan_add_conn(struct l2cap_conn *conn) +static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) { - struct lowpan_peer *peer = NULL; - struct lowpan_dev *dev; struct net_device *netdev; int err = 0; unsigned long flags; - if (!is_bt_6lowpan(conn->hcon)) - return 0; - - peer = lookup_peer(conn); - if (peer) - return -EEXIST; - - dev = lookup_dev(conn); - if (dev) - return add_peer_conn(conn, dev); - - netdev = alloc_netdev(sizeof(*dev), IFACE_NAME_TEMPLATE, netdev_setup); + netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE, + NET_NAME_UNKNOWN, netdev_setup); if (!netdev) return -ENOMEM; - set_dev_addr(netdev, &conn->hcon->src, conn->hcon->src_type); + set_dev_addr(netdev, &chan->src, chan->src_type); netdev->netdev_ops = &netdev_ops; - SET_NETDEV_DEV(netdev, &conn->hcon->dev); + SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); err = register_netdev(netdev); @@ -749,28 +842,61 @@ int bt_6lowpan_add_conn(struct l2cap_conn *conn) goto out; } - BT_DBG("ifindex %d peer bdaddr %pMR my addr %pMR", - netdev->ifindex, &conn->hcon->dst, &conn->hcon->src); + BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d", + netdev->ifindex, &chan->dst, chan->dst_type, + &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); - dev = netdev_priv(netdev); - dev->netdev = netdev; - dev->hdev = conn->hcon->hdev; - INIT_LIST_HEAD(&dev->peers); + *dev = netdev_priv(netdev); + (*dev)->netdev = netdev; + (*dev)->hdev = chan->conn->hcon->hdev; + INIT_LIST_HEAD(&(*dev)->peers); write_lock_irqsave(&devices_lock, flags); - INIT_LIST_HEAD(&dev->list); - list_add(&dev->list, &bt_6lowpan_devices); + INIT_LIST_HEAD(&(*dev)->list); + list_add(&(*dev)->list, &bt_6lowpan_devices); write_unlock_irqrestore(&devices_lock, flags); - ifup(netdev); - - return add_peer_conn(conn, dev); + return 0; out: return err; } +static inline void chan_ready_cb(struct l2cap_chan *chan) +{ + struct 
lowpan_dev *dev; + + dev = lookup_dev(chan->conn); + + BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev); + + if (!dev) { + if (setup_netdev(chan, &dev) < 0) { + l2cap_chan_del(chan, -ENOENT); + return; + } + } + + if (!try_module_get(THIS_MODULE)) + return; + + add_peer_chan(chan, dev); + ifup(dev->netdev); +} + +static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) +{ + struct l2cap_chan *chan; + + chan = chan_open(pchan); + chan->ops = pchan->ops; + + BT_DBG("chan %p pchan %p", chan, pchan); + + return chan; +} + static void delete_netdev(struct work_struct *work) { struct lowpan_dev *entry = container_of(work, struct lowpan_dev, @@ -781,26 +907,43 @@ static void delete_netdev(struct work_struct *work) /* The entry pointer is deleted in device_event() */ } -int bt_6lowpan_del_conn(struct l2cap_conn *conn) +static void chan_close_cb(struct l2cap_chan *chan) { struct lowpan_dev *entry, *tmp; struct lowpan_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; unsigned long flags; - bool last = false; + bool last = false, removed = true; - if (!conn || !is_bt_6lowpan(conn->hcon)) - return 0; + BT_DBG("chan %p conn %p", chan, chan->conn); + + if (chan->conn && chan->conn->hcon) { + if (!is_bt_6lowpan(chan->conn->hcon)) + return; + + /* If conn is set, then the netdev is also there and we should + * not remove it. + */ + removed = false; + } write_lock_irqsave(&devices_lock, flags); list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { dev = lowpan_dev(entry->netdev); - peer = peer_lookup_conn(dev, conn); + peer = peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); err = 0; + + BT_DBG("dev %p removing %speer %p", dev, + last ? 
"last " : "1 ", peer); + BT_DBG("chan %p orig refcnt %d", chan, + atomic_read(&chan->kref.refcount)); + + l2cap_chan_put(chan); + kfree(peer); break; } } @@ -810,18 +953,408 @@ int bt_6lowpan_del_conn(struct l2cap_conn *conn) cancel_delayed_work_sync(&dev->notify_peers); - /* bt_6lowpan_del_conn() is called with hci dev lock held which - * means that we must delete the netdevice in worker thread. - */ - INIT_WORK(&entry->delete_netdev, delete_netdev); - schedule_work(&entry->delete_netdev); + ifdown(dev->netdev); + + if (!removed) { + INIT_WORK(&entry->delete_netdev, delete_netdev); + schedule_work(&entry->delete_netdev); + } } else { write_unlock_irqrestore(&devices_lock, flags); } + return; +} + +static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) +{ + BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn, + state_to_string(state), err); +} + +static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, + unsigned long len, int nb) +{ + /* Note that we must allocate using GFP_ATOMIC here as + * this function is called originally from netdev hard xmit + * function in atomic context. 
+ */ + return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); +} + +static void chan_suspend_cb(struct l2cap_chan *chan) +{ + struct sk_buff *skb = chan->data; + + BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + + if (!skb) + return; + + lowpan_cb(skb)->status = -EAGAIN; +} + +static void chan_resume_cb(struct l2cap_chan *chan) +{ + struct sk_buff *skb = chan->data; + + BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + + if (!skb) + return; + + lowpan_cb(skb)->status = 0; +} + +static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) +{ + return L2CAP_CONN_TIMEOUT; +} + +static const struct l2cap_ops bt_6lowpan_chan_ops = { + .name = "L2CAP 6LoWPAN channel", + .new_connection = chan_new_conn_cb, + .recv = chan_recv_cb, + .close = chan_close_cb, + .state_change = chan_state_change_cb, + .ready = chan_ready_cb, + .resume = chan_resume_cb, + .suspend = chan_suspend_cb, + .get_sndtimeo = chan_get_sndtimeo_cb, + .alloc_skb = chan_alloc_skb_cb, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, + + .teardown = l2cap_chan_no_teardown, + .defer = l2cap_chan_no_defer, + .set_shutdown = l2cap_chan_no_set_shutdown, +}; + +static inline __u8 bdaddr_type(__u8 type) +{ + if (type == ADDR_LE_DEV_PUBLIC) + return BDADDR_LE_PUBLIC; + else + return BDADDR_LE_RANDOM; +} + +static struct l2cap_chan *chan_get(void) +{ + struct l2cap_chan *pchan; + + pchan = chan_create(); + if (!pchan) + return NULL; + + pchan->ops = &bt_6lowpan_chan_ops; + + return pchan; +} + +static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) +{ + struct l2cap_chan *pchan; + int err; + + pchan = chan_get(); + if (!pchan) + return -EINVAL; + + err = l2cap_chan_connect(pchan, cpu_to_le16(psm_6lowpan), 0, + addr, dst_type); + + BT_DBG("chan %p err %d", pchan, err); + if (err < 0) + l2cap_chan_put(pchan); + return err; } +static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) +{ + struct lowpan_peer *peer; + + BT_DBG("conn %p dst type %d", conn, dst_type); + + peer = 
lookup_peer(conn); + if (!peer) + return -ENOENT; + + BT_DBG("peer %p chan %p", peer, peer->chan); + + l2cap_chan_close(peer->chan, ENOENT); + + return 0; +} + +static struct l2cap_chan *bt_6lowpan_listen(void) +{ + bdaddr_t *addr = BDADDR_ANY; + struct l2cap_chan *pchan; + int err; + + if (psm_6lowpan == 0) + return NULL; + + pchan = chan_get(); + if (!pchan) + return NULL; + + pchan->state = BT_LISTEN; + pchan->src_type = BDADDR_LE_PUBLIC; + + BT_DBG("psm 0x%04x chan %p src type %d", psm_6lowpan, pchan, + pchan->src_type); + + err = l2cap_add_psm(pchan, addr, cpu_to_le16(psm_6lowpan)); + if (err) { + l2cap_chan_put(pchan); + BT_ERR("psm cannot be added err %d", err); + return NULL; + } + + return pchan; +} + +static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, + struct l2cap_conn **conn) +{ + struct hci_conn *hcon; + struct hci_dev *hdev; + bdaddr_t *src = BDADDR_ANY; + int n; + + n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", + &addr->b[5], &addr->b[4], &addr->b[3], + &addr->b[2], &addr->b[1], &addr->b[0], + addr_type); + + if (n < 7) + return -EINVAL; + + hdev = hci_get_route(addr, src); + if (!hdev) + return -ENOENT; + + hci_dev_lock(hdev); + hcon = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + hci_dev_unlock(hdev); + + if (!hcon) + return -ENOENT; + + *conn = (struct l2cap_conn *)hcon->l2cap_data; + + BT_DBG("conn %p dst %pMR type %d", *conn, &hcon->dst, hcon->dst_type); + + return 0; +} + +static void disconnect_all_peers(void) +{ + struct lowpan_dev *entry, *tmp_dev; + struct lowpan_peer *peer, *tmp_peer, *new_peer; + struct list_head peers; + unsigned long flags; + + INIT_LIST_HEAD(&peers); + + /* We make a separate list of peers as the close_cb() will + * modify the device peers list so it is better not to mess + * with the same list at the same time. 
+ */ + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp_dev, &bt_6lowpan_devices, list) { + list_for_each_entry_safe(peer, tmp_peer, &entry->peers, list) { + new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); + if (!new_peer) + break; + + new_peer->chan = peer->chan; + INIT_LIST_HEAD(&new_peer->list); + + list_add(&new_peer->list, &peers); + } + } + + read_unlock_irqrestore(&devices_lock, flags); + + list_for_each_entry_safe(peer, tmp_peer, &peers, list) { + l2cap_chan_close(peer->chan, ENOENT); + kfree(peer); + } +} + +static int lowpan_psm_set(void *data, u64 val) +{ + u16 psm; + + psm = val; + if (psm == 0 || psm_6lowpan != psm) + /* Disconnect existing connections if 6lowpan is + * disabled (psm = 0), or if psm changes. + */ + disconnect_all_peers(); + + psm_6lowpan = psm; + + if (listen_chan) { + l2cap_chan_close(listen_chan, 0); + l2cap_chan_put(listen_chan); + } + + listen_chan = bt_6lowpan_listen(); + + return 0; +} + +static int lowpan_psm_get(void *data, u64 *val) +{ + *val = psm_6lowpan; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_psm_fops, lowpan_psm_get, + lowpan_psm_set, "%llu\n"); + +static ssize_t lowpan_control_write(struct file *fp, + const char __user *user_buffer, + size_t count, + loff_t *position) +{ + char buf[32]; + size_t buf_size = min(count, sizeof(buf) - 1); + int ret; + bdaddr_t addr; + u8 addr_type; + struct l2cap_conn *conn = NULL; + + if (copy_from_user(buf, user_buffer, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + if (memcmp(buf, "connect ", 8) == 0) { + ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); + if (ret == -EINVAL) + return ret; + + if (listen_chan) { + l2cap_chan_close(listen_chan, 0); + l2cap_chan_put(listen_chan); + listen_chan = NULL; + } + + if (conn) { + struct lowpan_peer *peer; + + if (!is_bt_6lowpan(conn->hcon)) + return -EINVAL; + + peer = lookup_peer(conn); + if (peer) { + BT_DBG("6LoWPAN connection already exists"); + return -EALREADY; + } + + 
BT_DBG("conn %p dst %pMR type %d user %d", conn, + &conn->hcon->dst, conn->hcon->dst_type, + addr_type); + } + + ret = bt_6lowpan_connect(&addr, addr_type); + if (ret < 0) + return ret; + + return count; + } + + if (memcmp(buf, "disconnect ", 11) == 0) { + ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); + if (ret < 0) + return ret; + + ret = bt_6lowpan_disconnect(conn, addr_type); + if (ret < 0) + return ret; + + return count; + } + + return count; +} + +static int lowpan_control_show(struct seq_file *f, void *ptr) +{ + struct lowpan_dev *entry, *tmp_dev; + struct lowpan_peer *peer, *tmp_peer; + unsigned long flags; + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp_dev, &bt_6lowpan_devices, list) { + list_for_each_entry_safe(peer, tmp_peer, &entry->peers, list) + seq_printf(f, "%pMR (type %u)\n", + &peer->chan->dst, peer->chan->dst_type); + } + + read_unlock_irqrestore(&devices_lock, flags); + + return 0; +} + +static int lowpan_control_open(struct inode *inode, struct file *file) +{ + return single_open(file, lowpan_control_show, inode->i_private); +} + +static const struct file_operations lowpan_control_fops = { + .open = lowpan_control_open, + .read = seq_read, + .write = lowpan_control_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void disconnect_devices(void) +{ + struct lowpan_dev *entry, *tmp, *new_dev; + struct list_head devices; + unsigned long flags; + + INIT_LIST_HEAD(&devices); + + /* We make a separate list of devices because the unregister_netdev() + * will call device_event() which will also want to modify the same + * devices list. 
+ */ + + read_lock_irqsave(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { + new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC); + if (!new_dev) + break; + + new_dev->netdev = entry->netdev; + INIT_LIST_HEAD(&new_dev->list); + + list_add(&new_dev->list, &devices); + } + + read_unlock_irqrestore(&devices_lock, flags); + + list_for_each_entry_safe(entry, tmp, &devices, list) { + ifdown(entry->netdev); + BT_DBG("Unregistering netdev %s %p", + entry->netdev->name, entry->netdev); + unregister_netdev(entry->netdev); + kfree(entry); + } +} + static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -838,6 +1371,8 @@ static int device_event(struct notifier_block *unused, list_for_each_entry_safe(entry, tmp, &bt_6lowpan_devices, list) { if (entry->netdev == netdev) { + BT_DBG("Unregistered netdev %s %p", + netdev->name, netdev); list_del(&entry->list); kfree(entry); break; @@ -854,12 +1389,37 @@ static struct notifier_block bt_6lowpan_dev_notifier = { .notifier_call = device_event, }; -int bt_6lowpan_init(void) +static int __init bt_6lowpan_init(void) { + lowpan_psm_debugfs = debugfs_create_file("6lowpan_psm", 0644, + bt_debugfs, NULL, + &lowpan_psm_fops); + lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644, + bt_debugfs, NULL, + &lowpan_control_fops); + return register_netdevice_notifier(&bt_6lowpan_dev_notifier); } -void bt_6lowpan_cleanup(void) +static void __exit bt_6lowpan_exit(void) { + debugfs_remove(lowpan_psm_debugfs); + debugfs_remove(lowpan_control_debugfs); + + if (listen_chan) { + l2cap_chan_close(listen_chan, 0); + l2cap_chan_put(listen_chan); + } + + disconnect_devices(); + unregister_netdevice_notifier(&bt_6lowpan_dev_notifier); } + +module_init(bt_6lowpan_init); +module_exit(bt_6lowpan_exit); + +MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>"); +MODULE_DESCRIPTION("Bluetooth 6LoWPAN"); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); diff --git 
a/net/bluetooth/6lowpan.h b/net/bluetooth/6lowpan.h deleted file mode 100644 index 5d281f1eaf55..000000000000 --- a/net/bluetooth/6lowpan.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - Copyright (c) 2013 Intel Corp. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 and - only version 2 as published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. -*/ - -#ifndef __6LOWPAN_H -#define __6LOWPAN_H - -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/bluetooth/l2cap.h> - -#if IS_ENABLED(CONFIG_BT_6LOWPAN) -int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb); -int bt_6lowpan_add_conn(struct l2cap_conn *conn); -int bt_6lowpan_del_conn(struct l2cap_conn *conn); -int bt_6lowpan_init(void); -void bt_6lowpan_cleanup(void); -#else -static int bt_6lowpan_recv(struct l2cap_conn *conn, struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} -static int bt_6lowpan_add_conn(struct l2cap_conn *conn) -{ - return -EOPNOTSUPP; -} -int bt_6lowpan_del_conn(struct l2cap_conn *conn) -{ - return -EOPNOTSUPP; -} -static int bt_6lowpan_init(void) -{ - return -EOPNOTSUPP; -} -static void bt_6lowpan_cleanup(void) { } -#endif - -#endif /* __6LOWPAN_H */ diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 06ec14499ca1..600fb29288f4 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -6,7 +6,6 @@ menuconfig BT tristate "Bluetooth subsystem support" depends on NET && !S390 depends on RFKILL || !RFKILL - select 6LOWPAN_IPHC if BT_6LOWPAN select CRC16 select CRYPTO select CRYPTO_BLKCIPHER @@ -41,10 +40,10 @@ menuconfig BT more information, see <http://www.bluez.org/>. 
config BT_6LOWPAN - bool "Bluetooth 6LoWPAN support" - depends on BT && IPV6 + tristate "Bluetooth 6LoWPAN support" + depends on BT && 6LOWPAN help - IPv6 compression over Bluetooth. + IPv6 compression over Bluetooth Low Energy. source "net/bluetooth/rfcomm/Kconfig" diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index ca51246b1016..886e9aa3ecf1 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -7,10 +7,12 @@ obj-$(CONFIG_BT_RFCOMM) += rfcomm/ obj-$(CONFIG_BT_BNEP) += bnep/ obj-$(CONFIG_BT_CMTP) += cmtp/ obj-$(CONFIG_BT_HIDP) += hidp/ +obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o + +bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \ a2mp.o amp.o -bluetooth-$(CONFIG_BT_6LOWPAN) += 6lowpan.o subdir-ccflags-y += -D__CHECK_ENDIAN__ diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 9514cc9e850c..5dcade511fdb 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -63,7 +63,7 @@ void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data) msg.msg_iov = (struct iovec *) &iv; msg.msg_iovlen = 1; - l2cap_chan_send(chan, &msg, total_len, 0); + l2cap_chan_send(chan, &msg, total_len); kfree(cmd); } @@ -693,18 +693,19 @@ static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state, } static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; - skb = bt_skb_alloc(len, GFP_KERNEL); + skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); return skb; } -static struct l2cap_ops a2mp_chan_ops = { +static const struct l2cap_ops a2mp_chan_ops = { .name = "L2CAP A2MP channel", .recv = a2mp_chan_recv_cb, .close = a2mp_chan_close_cb, @@ -719,6 +720,7 @@ static struct l2cap_ops a2mp_chan_ops = { .resume = l2cap_chan_no_resume, .set_shutdown = l2cap_chan_no_set_shutdown, 
.get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, }; static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 2021c481cdb6..339c74ad4553 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -639,7 +639,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations bt_seq_ops = { +static const struct seq_operations bt_seq_ops = { .start = bt_seq_start, .next = bt_seq_next, .stop = bt_seq_stop, @@ -709,8 +709,11 @@ EXPORT_SYMBOL_GPL(bt_debugfs); static int __init bt_init(void) { + struct sk_buff *skb; int err; + BUILD_BUG_ON(sizeof(struct bt_skb_cb) > sizeof(skb->cb)); + BT_INFO("Core ver %s", VERSION); bt_debugfs = debugfs_create_dir("bluetooth", NULL); diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index bb39509b3f06..2640d78f30b8 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -113,8 +113,9 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, { bdaddr_t *dst = &mgr->l2cap_conn->hcon->dst; struct hci_conn *hcon; + u8 role = out ? 
HCI_ROLE_MASTER : HCI_ROLE_SLAVE; - hcon = hci_conn_add(hdev, AMP_LINK, dst); + hcon = hci_conn_add(hdev, AMP_LINK, dst, role); if (!hcon) return NULL; @@ -125,7 +126,6 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, hcon->handle = __next_handle(mgr); hcon->remote_id = remote_id; hcon->amp_mgr = amp_mgr_get(mgr); - hcon->out = out; return hcon; } @@ -133,8 +133,8 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, /* AMP crypto key generation interface */ static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) { - int ret = 0; struct crypto_shash *tfm; + int ret; if (!ksize) return -EINVAL; @@ -149,15 +149,14 @@ static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) if (ret) { BT_DBG("crypto_ahash_setkey failed: err %d", ret); } else { - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; - } desc; + char desc[sizeof(struct shash_desc) + + crypto_shash_descsize(tfm)] CRYPTO_MINALIGN_ATTR; + struct shash_desc *shash = (struct shash_desc *)desc; - desc.shash.tfm = tfm; - desc.shash.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + shash->tfm = tfm; + shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - ret = crypto_shash_digest(&desc.shash, plaintext, psize, + ret = crypto_shash_digest(shash, plaintext, psize, output); } diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index a841d3e776c5..85bcc21e84d2 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -538,8 +538,9 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) /* session struct allocated as private part of net_device */ dev = alloc_netdev(sizeof(struct bnep_session), - (*req->device) ? req->device : "bnep%d", - bnep_net_setup); + (*req->device) ? 
req->device : "bnep%d", + NET_NAME_UNKNOWN, + bnep_net_setup); if (!dev) return -ENOMEM; diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index cd75e4d64b90..1ca8a87a0787 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -362,12 +362,6 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) CAPIMSG_SETCONTROL(skb->data, contr); } - if (!ctrl) { - BT_ERR("Can't find controller %d for message", session->num); - kfree_skb(skb); - return; - } - capi_ctr_handle_message(ctrl, appl, skb); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a7a27bc2c0b1..b9517bd17190 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -36,19 +36,25 @@ struct sco_param { u16 pkt_type; u16 max_latency; + u8 retrans_effort; +}; + +static const struct sco_param esco_param_cvsd[] = { + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */ + { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */ }; static const struct sco_param sco_param_cvsd[] = { - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a }, /* S3 */ - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007 }, /* S2 */ - { EDR_ESCO_MASK | ESCO_EV3, 0x0007 }, /* S1 */ - { EDR_ESCO_MASK | ESCO_HV3, 0xffff }, /* D1 */ - { EDR_ESCO_MASK | ESCO_HV1, 0xffff }, /* D0 */ + { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */ + { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; -static const struct sco_param sco_param_wideband[] = { - { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d }, /* T2 */ - { EDR_ESCO_MASK | ESCO_EV3, 0x0008 }, /* T1 */ +static const struct sco_param esco_param_msbc[] = { + { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ + { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; static void hci_le_create_connection_cancel(struct hci_conn *conn) @@ -66,8 +72,7 @@ static void 
hci_acl_create_connection(struct hci_conn *conn) conn->state = BT_CONNECT; conn->out = true; - - conn->link_mode = HCI_LM_MASTER; + conn->role = HCI_ROLE_MASTER; conn->attempt++; @@ -117,26 +122,39 @@ static void hci_reject_sco(struct hci_conn *conn) { struct hci_cp_reject_sync_conn_req cp; - cp.reason = HCI_ERROR_REMOTE_USER_TERM; + cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; bacpy(&cp.bdaddr, &conn->dst); hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp); } -void hci_disconnect(struct hci_conn *conn, __u8 reason) +int hci_disconnect(struct hci_conn *conn, __u8 reason) { struct hci_cp_disconnect cp; BT_DBG("hcon %p", conn); + /* When we are master of an established connection and it enters + * the disconnect timeout, then go ahead and try to read the + * current clock offset. Processing of the result is done + * within the event handling and hci_clock_offset_evt function. + */ + if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER) { + struct hci_dev *hdev = conn->hdev; + struct hci_cp_read_clock_offset cp; + + cp.handle = cpu_to_le16(conn->handle); + hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(cp), &cp); + } + conn->state = BT_DISCONN; cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; - hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); + return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp); } -static void hci_amp_disconn(struct hci_conn *conn, __u8 reason) +static void hci_amp_disconn(struct hci_conn *conn) { struct hci_cp_disconn_phy_link cp; @@ -145,7 +163,7 @@ static void hci_amp_disconn(struct hci_conn *conn, __u8 reason) conn->state = BT_DISCONN; cp.phy_handle = HCI_PHY_HANDLE(conn->handle); - cp.reason = reason; + cp.reason = hci_proto_disconn_ind(conn); hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK, sizeof(cp), &cp); } @@ -189,21 +207,26 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: - if 
(conn->attempt > ARRAY_SIZE(sco_param_wideband)) + if (conn->attempt > ARRAY_SIZE(esco_param_msbc)) return false; - cp.retrans_effort = 0x02; - param = &sco_param_wideband[conn->attempt - 1]; + param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: - if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) - return false; - cp.retrans_effort = 0x01; - param = &sco_param_cvsd[conn->attempt - 1]; + if (lmp_esco_capable(conn->link)) { + if (conn->attempt > ARRAY_SIZE(esco_param_cvsd)) + return false; + param = &esco_param_cvsd[conn->attempt - 1]; + } else { + if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) + return false; + param = &sco_param_cvsd[conn->attempt - 1]; + } break; default: return false; } + cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); @@ -213,14 +236,26 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return true; } -void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, - u16 latency, u16 to_multiplier) +u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, + u16 to_multiplier) { - struct hci_cp_le_conn_update cp; struct hci_dev *hdev = conn->hdev; + struct hci_conn_params *params; + struct hci_cp_le_conn_update cp; - memset(&cp, 0, sizeof(cp)); + hci_dev_lock(hdev); + + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + params->conn_min_interval = min; + params->conn_max_interval = max; + params->conn_latency = latency; + params->supervision_timeout = to_multiplier; + } + + hci_dev_unlock(hdev); + memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(min); cp.conn_interval_max = cpu_to_le16(max); @@ -230,6 +265,11 @@ void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, cp.max_ce_len = cpu_to_le16(0x0000); hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); + + if (params) + return 0x01; + + return 0x00; } 
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, @@ -271,20 +311,6 @@ void hci_sco_setup(struct hci_conn *conn, __u8 status) } } -static void hci_conn_disconnect(struct hci_conn *conn) -{ - __u8 reason = hci_proto_disconn_ind(conn); - - switch (conn->type) { - case AMP_LINK: - hci_amp_disconn(conn, reason); - break; - default: - hci_disconnect(conn, reason); - break; - } -} - static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, @@ -319,7 +345,12 @@ static void hci_conn_timeout(struct work_struct *work) break; case BT_CONFIG: case BT_CONNECTED: - hci_conn_disconnect(conn); + if (conn->type == AMP_LINK) { + hci_amp_disconn(conn); + } else { + __u8 reason = hci_proto_disconn_ind(conn); + hci_disconnect(conn, reason); + } break; default: conn->state = BT_CLOSED; @@ -336,9 +367,6 @@ static void hci_conn_idle(struct work_struct *work) BT_DBG("hcon %p mode %d", conn, conn->mode); - if (test_bit(HCI_RAW, &hdev->flags)) - return; - if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) return; @@ -398,13 +426,14 @@ static void le_conn_timeout(struct work_struct *work) hci_le_create_connection_cancel(conn); } -struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) +struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, + u8 role) { struct hci_conn *conn; BT_DBG("%s dst %pMR", hdev->name, dst); - conn = kzalloc(sizeof(struct hci_conn), GFP_KERNEL); + conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return NULL; @@ -412,6 +441,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) bacpy(&conn->src, &hdev->bdaddr); conn->hdev = hdev; conn->type = type; + conn->role = role; conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; conn->auth_type = HCI_AT_GENERAL_BONDING; @@ -424,6 +454,9 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) set_bit(HCI_CONN_POWER_SAVE, &conn->flags); 
conn->disc_timeout = HCI_DISCONN_TIMEOUT; + if (conn->role == HCI_ROLE_MASTER) + conn->out = true; + switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; @@ -529,7 +562,6 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || - test_bit(HCI_RAW, &d->flags) || test_bit(HCI_USER_CHANNEL, &d->dev_flags) || d->dev_type != HCI_BREDR) continue; @@ -562,6 +594,15 @@ EXPORT_SYMBOL(hci_get_route); void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; + struct hci_conn_params *params; + + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, + conn->dst_type); + if (params && params->conn) { + hci_conn_drop(params->conn); + hci_conn_put(params->conn); + params->conn = NULL; + } conn->state = BT_CLOSED; @@ -627,7 +668,8 @@ static void hci_req_add_le_create_conn(struct hci_request *req, cp.own_address_type = own_addr_type; cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); - cp.supervision_timeout = cpu_to_le16(0x002a); + cp.conn_latency = cpu_to_le16(conn->le_conn_latency); + cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); @@ -644,15 +686,12 @@ static void hci_req_directed_advertising(struct hci_request *req, u8 own_addr_type; u8 enable; - enable = 0x00; - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); - - /* Clear the HCI_ADVERTISING bit temporarily so that the + /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. 
*/ - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + clear_bit(HCI_LE_ADV, &hdev->dev_flags); /* Set require_privacy to false so that the remote device has a * chance of identifying us. @@ -676,7 +715,8 @@ static void hci_req_directed_advertising(struct hci_request *req, } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, - u8 dst_type, u8 sec_level, u8 auth_type) + u8 dst_type, u8 sec_level, u16 conn_timeout, + u8 role) { struct hci_conn_params *params; struct hci_conn *conn; @@ -696,7 +736,6 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst); if (conn) { conn->pending_sec_level = sec_level; - conn->auth_type = auth_type; goto done; } @@ -726,32 +765,56 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, dst_type = ADDR_LE_DEV_RANDOM; } - conn = hci_conn_add(hdev, LE_LINK, dst); + conn = hci_conn_add(hdev, LE_LINK, dst, role); if (!conn) return ERR_PTR(-ENOMEM); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; - conn->auth_type = auth_type; + conn->conn_timeout = conn_timeout; hci_req_init(&req, hdev); - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { + /* Disable advertising if we're active. For master role + * connections most controllers will refuse to connect if + * advertising is enabled, and for slave role connections we + * anyway have to disable it in order to start directed + * advertising. + */ + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) { + u8 enable = 0x00; + hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), + &enable); + } + + /* If requested to connect as slave use directed advertising */ + if (conn->role == HCI_ROLE_SLAVE) { + /* If we're active scanning most controllers are unable + * to initiate advertising. Simply reject the attempt. 
+ */ + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + hdev->le_scan_type == LE_SCAN_ACTIVE) { + skb_queue_purge(&req.cmd_q); + hci_conn_del(conn); + return ERR_PTR(-EBUSY); + } + hci_req_directed_advertising(&req, conn); goto create_conn; } - conn->out = true; - conn->link_mode |= HCI_LM_MASTER; - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { conn->le_conn_min_interval = params->conn_min_interval; conn->le_conn_max_interval = params->conn_max_interval; + conn->le_conn_latency = params->conn_latency; + conn->le_supv_timeout = params->supervision_timeout; } else { conn->le_conn_min_interval = hdev->le_conn_min_interval; conn->le_conn_max_interval = hdev->le_conn_max_interval; + conn->le_conn_latency = hdev->le_conn_latency; + conn->le_supv_timeout = hdev->le_supv_timeout; } /* If controller is scanning, we stop it since some controllers are @@ -785,11 +848,11 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, struct hci_conn *acl; if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { - acl = hci_conn_add(hdev, ACL_LINK, dst); + acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); if (!acl) return ERR_PTR(-ENOMEM); } @@ -818,7 +881,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { - sco = hci_conn_add(hdev, type, dst); + sco = hci_conn_add(hdev, type, dst, HCI_ROLE_MASTER); if (!sco) { hci_conn_drop(acl); return ERR_PTR(-ENOMEM); @@ -865,7 +928,8 @@ int hci_conn_check_link_mode(struct hci_conn *conn) return 0; } - if (hci_conn_ssp_enabled(conn) && !(conn->link_mode & HCI_LM_ENCRYPT)) + if (hci_conn_ssp_enabled(conn) && + !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; return 1; @@ -881,7 +945,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if 
(sec_level > conn->sec_level) conn->pending_sec_level = sec_level; - else if (conn->link_mode & HCI_LM_AUTH) + else if (test_bit(HCI_CONN_AUTH, &conn->flags)) return 1; /* Make sure we preserve an existing MITM requirement*/ @@ -899,7 +963,7 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) /* If we're already encrypted set the REAUTH_PEND flag, * otherwise set the ENCRYPT_PEND. */ - if (conn->link_mode & HCI_LM_ENCRYPT) + if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); else set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); @@ -923,7 +987,8 @@ static void hci_conn_encrypt(struct hci_conn *conn) } /* Enable security */ -int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) +int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, + bool initiator) { BT_DBG("hcon %p", conn); @@ -940,7 +1005,7 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) return 1; /* For other security levels we need the link key. 
*/ - if (!(conn->link_mode & HCI_LM_AUTH)) + if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; /* An authenticated FIPS approved combination key has sufficient @@ -976,11 +1041,14 @@ auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) return 0; + if (initiator) + set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); + if (!hci_conn_auth(conn, sec_level, auth_type)) return 0; encrypt: - if (conn->link_mode & HCI_LM_ENCRYPT) + if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 1; hci_conn_encrypt(conn); @@ -1027,7 +1095,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("hcon %p", conn); - if (!role && conn->link_mode & HCI_LM_MASTER) + if (role == conn->role) return 1; if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) { @@ -1048,9 +1116,6 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) BT_DBG("hcon %p mode %d", conn, conn->mode); - if (test_bit(HCI_RAW, &hdev->flags)) - return; - if (conn->mode != HCI_CM_SNIFF) goto timer; @@ -1101,6 +1166,28 @@ void hci_conn_check_pending(struct hci_dev *hdev) hci_dev_unlock(hdev); } +static u32 get_link_mode(struct hci_conn *conn) +{ + u32 link_mode = 0; + + if (conn->role == HCI_ROLE_MASTER) + link_mode |= HCI_LM_MASTER; + + if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) + link_mode |= HCI_LM_ENCRYPT; + + if (test_bit(HCI_CONN_AUTH, &conn->flags)) + link_mode |= HCI_LM_AUTH; + + if (test_bit(HCI_CONN_SECURE, &conn->flags)) + link_mode |= HCI_LM_SECURE; + + if (test_bit(HCI_CONN_FIPS, &conn->flags)) + link_mode |= HCI_LM_FIPS; + + return link_mode; +} + int hci_get_conn_list(void __user *arg) { struct hci_conn *c; @@ -1136,7 +1223,7 @@ int hci_get_conn_list(void __user *arg) (ci + n)->type = c->type; (ci + n)->out = c->out; (ci + n)->state = c->state; - (ci + n)->link_mode = c->link_mode; + (ci + n)->link_mode = get_link_mode(c); if (++n >= req.conn_num) break; } @@ -1172,7 +1259,7 @@ int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) ci.type = 
conn->type; ci.out = conn->out; ci.state = conn->state; - ci.link_mode = conn->link_mode; + ci.link_mode = get_link_mode(conn); } hci_dev_unlock(hdev); @@ -1209,11 +1296,16 @@ struct hci_chan *hci_chan_create(struct hci_conn *conn) BT_DBG("%s hcon %p", hdev->name, conn); - chan = kzalloc(sizeof(struct hci_chan), GFP_KERNEL); + if (test_bit(HCI_CONN_DROP, &conn->flags)) { + BT_DBG("Refusing to create new hci_chan"); + return NULL; + } + + chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; - chan->conn = conn; + chan->conn = hci_conn_get(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; @@ -1233,7 +1325,10 @@ void hci_chan_del(struct hci_chan *chan) synchronize_rcu(); - hci_conn_drop(conn); + /* Prevent new hci_chan's to be created for this hci_conn */ + set_bit(HCI_CONN_DROP, &conn->flags); + + hci_conn_put(conn); skb_queue_purge(&chan->data_q); kfree(chan); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0a43cce9a914..cb05d7f16a34 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -35,6 +35,7 @@ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> +#include <net/bluetooth/mgmt.h> #include "smp.h" @@ -53,6 +54,15 @@ DEFINE_RWLOCK(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); +/* ----- HCI requests ----- */ + +#define HCI_REQ_DONE 0 +#define HCI_REQ_PEND 1 +#define HCI_REQ_CANCELED 2 + +#define hci_req_lock(d) mutex_lock(&d->req_lock) +#define hci_req_unlock(d) mutex_unlock(&d->req_lock) + /* ---- HCI notifications ---- */ static void hci_notify(struct hci_dev *hdev, int event) @@ -68,7 +78,7 @@ static ssize_t dut_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_DUT_MODE, &hdev->dev_flags) ? 'Y': 'N'; + buf[0] = test_bit(HCI_DUT_MODE, &hdev->dbg_flags) ? 
'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -94,7 +104,7 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_DUT_MODE, &hdev->dev_flags)) + if (enable == test_bit(HCI_DUT_MODE, &hdev->dbg_flags)) return -EALREADY; hci_req_lock(hdev); @@ -115,7 +125,7 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (err < 0) return err; - change_bit(HCI_DUT_MODE, &hdev->dev_flags); + change_bit(HCI_DUT_MODE, &hdev->dbg_flags); return count; } @@ -190,6 +200,31 @@ static const struct file_operations blacklist_fops = { .release = single_release, }; +static int whitelist_show(struct seq_file *f, void *p) +{ + struct hci_dev *hdev = f->private; + struct bdaddr_list *b; + + hci_dev_lock(hdev); + list_for_each_entry(b, &hdev->whitelist, list) + seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); + hci_dev_unlock(hdev); + + return 0; +} + +static int whitelist_open(struct inode *inode, struct file *file) +{ + return single_open(file, whitelist_show, inode->i_private); +} + +static const struct file_operations whitelist_fops = { + .open = whitelist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int uuids_show(struct seq_file *f, void *p) { struct hci_dev *hdev = f->private; @@ -352,62 +387,13 @@ static int auto_accept_delay_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get, auto_accept_delay_set, "%llu\n"); -static int ssp_debug_mode_set(void *data, u64 val) -{ - struct hci_dev *hdev = data; - struct sk_buff *skb; - __u8 mode; - int err; - - if (val != 0 && val != 1) - return -EINVAL; - - if (!test_bit(HCI_UP, &hdev->flags)) - return -ENETDOWN; - - hci_req_lock(hdev); - mode = val; - skb = __hci_cmd_sync(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), - &mode, HCI_CMD_TIMEOUT); - 
hci_req_unlock(hdev); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - err = -bt_to_errno(skb->data[0]); - kfree_skb(skb); - - if (err < 0) - return err; - - hci_dev_lock(hdev); - hdev->ssp_debug_mode = val; - hci_dev_unlock(hdev); - - return 0; -} - -static int ssp_debug_mode_get(void *data, u64 *val) -{ - struct hci_dev *hdev = data; - - hci_dev_lock(hdev); - *val = hdev->ssp_debug_mode; - hci_dev_unlock(hdev); - - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(ssp_debug_mode_fops, ssp_debug_mode_get, - ssp_debug_mode_set, "%llu\n"); - static ssize_t force_sc_support_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_FORCE_SC, &hdev->dev_flags) ? 'Y': 'N'; + buf[0] = test_bit(HCI_FORCE_SC, &hdev->dbg_flags) ? 'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -432,10 +418,10 @@ static ssize_t force_sc_support_write(struct file *file, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + if (enable == test_bit(HCI_FORCE_SC, &hdev->dbg_flags)) return -EALREADY; - change_bit(HCI_FORCE_SC, &hdev->dev_flags); + change_bit(HCI_FORCE_SC, &hdev->dbg_flags); return count; } @@ -719,7 +705,7 @@ static ssize_t force_static_address_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) ? 'Y': 'N'; + buf[0] = test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) ? 
'Y': 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -744,10 +730,10 @@ static ssize_t force_static_address_write(struct file *file, if (strtobool(buf, &enable)) return -EINVAL; - if (enable == test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags)) + if (enable == test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags)) return -EALREADY; - change_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags); + change_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags); return count; } @@ -900,177 +886,169 @@ static int conn_max_interval_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get, conn_max_interval_set, "%llu\n"); -static int adv_channel_map_set(void *data, u64 val) +static int conn_latency_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x01 || val > 0x07) + if (val > 0x01f3) return -EINVAL; hci_dev_lock(hdev); - hdev->le_adv_channel_map = val; + hdev->le_conn_latency = val; hci_dev_unlock(hdev); return 0; } -static int adv_channel_map_get(void *data, u64 *val) +static int conn_latency_get(void *data, u64 *val) { struct hci_dev *hdev = data; hci_dev_lock(hdev); - *val = hdev->le_adv_channel_map; + *val = hdev->le_conn_latency; hci_dev_unlock(hdev); return 0; } -DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get, - adv_channel_map_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(conn_latency_fops, conn_latency_get, + conn_latency_set, "%llu\n"); -static ssize_t lowpan_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static int supervision_timeout_set(void *data, u64 val) { - struct hci_dev *hdev = file->private_data; - char buf[3]; + struct hci_dev *hdev = data; - buf[0] = test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags) ? 
'Y' : 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); + if (val < 0x000a || val > 0x0c80) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_supv_timeout = val; + hci_dev_unlock(hdev); + + return 0; } -static ssize_t lowpan_write(struct file *fp, const char __user *user_buffer, - size_t count, loff_t *position) +static int supervision_timeout_get(void *data, u64 *val) { - struct hci_dev *hdev = fp->private_data; - bool enable; - char buf[32]; - size_t buf_size = min(count, (sizeof(buf)-1)); + struct hci_dev *hdev = data; - if (copy_from_user(buf, user_buffer, buf_size)) - return -EFAULT; + hci_dev_lock(hdev); + *val = hdev->le_supv_timeout; + hci_dev_unlock(hdev); - buf[buf_size] = '\0'; + return 0; +} - if (strtobool(buf, &enable) < 0) - return -EINVAL; +DEFINE_SIMPLE_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get, + supervision_timeout_set, "%llu\n"); - if (enable == test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags)) - return -EALREADY; +static int adv_channel_map_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; - change_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags); + if (val < 0x01 || val > 0x07) + return -EINVAL; - return count; -} + hci_dev_lock(hdev); + hdev->le_adv_channel_map = val; + hci_dev_unlock(hdev); -static const struct file_operations lowpan_debugfs_fops = { - .open = simple_open, - .read = lowpan_read, - .write = lowpan_write, - .llseek = default_llseek, -}; + return 0; +} -static int le_auto_conn_show(struct seq_file *sf, void *ptr) +static int adv_channel_map_get(void *data, u64 *val) { - struct hci_dev *hdev = sf->private; - struct hci_conn_params *p; + struct hci_dev *hdev = data; hci_dev_lock(hdev); + *val = hdev->le_adv_channel_map; + hci_dev_unlock(hdev); - list_for_each_entry(p, &hdev->le_conn_params, list) { - seq_printf(sf, "%pMR %u %u\n", &p->addr, p->addr_type, - p->auto_connect); - } + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, 
adv_channel_map_get, + adv_channel_map_set, "%llu\n"); + +static int adv_min_interval_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); return 0; } -static int le_auto_conn_open(struct inode *inode, struct file *file) +static int adv_min_interval_get(void *data, u64 *val) { - return single_open(file, le_auto_conn_show, inode->i_private); + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->le_adv_min_interval; + hci_dev_unlock(hdev); + + return 0; } -static ssize_t le_auto_conn_write(struct file *file, const char __user *data, - size_t count, loff_t *offset) +DEFINE_SIMPLE_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get, + adv_min_interval_set, "%llu\n"); + +static int adv_max_interval_set(void *data, u64 val) { - struct seq_file *sf = file->private_data; - struct hci_dev *hdev = sf->private; - u8 auto_connect = 0; - bdaddr_t addr; - u8 addr_type; - char *buf; - int err = 0; - int n; + struct hci_dev *hdev = data; - /* Don't allow partial write */ - if (*offset != 0) + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) return -EINVAL; - if (count < 3) - return -EINVAL; + hci_dev_lock(hdev); + hdev->le_adv_max_interval = val; + hci_dev_unlock(hdev); - buf = memdup_user(data, count); - if (IS_ERR(buf)) - return PTR_ERR(buf); + return 0; +} - if (memcmp(buf, "add", 3) == 0) { - n = sscanf(&buf[4], "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu %hhu", - &addr.b[5], &addr.b[4], &addr.b[3], &addr.b[2], - &addr.b[1], &addr.b[0], &addr_type, - &auto_connect); +static int adv_max_interval_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; - if (n < 7) { - err = -EINVAL; - goto done; - } + hci_dev_lock(hdev); + *val = hdev->le_adv_max_interval; + hci_dev_unlock(hdev); - hci_dev_lock(hdev); - err = hci_conn_params_add(hdev, &addr, addr_type, auto_connect, - 
hdev->le_conn_min_interval, - hdev->le_conn_max_interval); - hci_dev_unlock(hdev); + return 0; +} - if (err) - goto done; - } else if (memcmp(buf, "del", 3) == 0) { - n = sscanf(&buf[4], "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", - &addr.b[5], &addr.b[4], &addr.b[3], &addr.b[2], - &addr.b[1], &addr.b[0], &addr_type); +DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get, + adv_max_interval_set, "%llu\n"); - if (n < 7) { - err = -EINVAL; - goto done; - } +static int device_list_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + struct hci_conn_params *p; - hci_dev_lock(hdev); - hci_conn_params_del(hdev, &addr, addr_type); - hci_dev_unlock(hdev); - } else if (memcmp(buf, "clr", 3) == 0) { - hci_dev_lock(hdev); - hci_conn_params_clear(hdev); - hci_pend_le_conns_clear(hdev); - hci_update_background_scan(hdev); - hci_dev_unlock(hdev); - } else { - err = -EINVAL; + hci_dev_lock(hdev); + list_for_each_entry(p, &hdev->le_conn_params, list) { + seq_printf(f, "%pMR %u %u\n", &p->addr, p->addr_type, + p->auto_connect); } + hci_dev_unlock(hdev); -done: - kfree(buf); + return 0; +} - if (err) - return err; - else - return count; +static int device_list_open(struct inode *inode, struct file *file) +{ + return single_open(file, device_list_show, inode->i_private); } -static const struct file_operations le_auto_conn_fops = { - .open = le_auto_conn_open, +static const struct file_operations device_list_fops = { + .open = device_list_open, .read = seq_read, - .write = le_auto_conn_write, .llseek = seq_lseek, .release = single_release, }; @@ -1426,9 +1404,6 @@ static void le_setup(struct hci_request *req) /* Read LE Supported States */ hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); - /* Read LE Advertising Channel TX Power */ - hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); - /* Read LE White List Size */ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL); @@ -1503,14 +1478,17 @@ static void hci_setup_event_mask(struct 
hci_request *req) /* Use a different default for LE-only devices */ memset(events, 0, sizeof(events)); events[0] |= 0x10; /* Disconnection Complete */ - events[0] |= 0x80; /* Encryption Change */ events[1] |= 0x08; /* Read Remote Version Information Complete */ events[1] |= 0x20; /* Command Complete */ events[1] |= 0x40; /* Command Status */ events[1] |= 0x80; /* Hardware Error */ events[2] |= 0x04; /* Number of Completed Packets */ events[3] |= 0x02; /* Data Buffer Overflow */ - events[5] |= 0x80; /* Encryption Key Refresh Complete */ + + if (hdev->le_features[0] & HCI_LE_ENCRYPTION) { + events[0] |= 0x80; /* Encryption Change */ + events[5] |= 0x80; /* Encryption Key Refresh Complete */ + } } if (lmp_inq_rssi_capable(hdev)) @@ -1549,13 +1527,6 @@ static void hci_setup_event_mask(struct hci_request *req) events[7] |= 0x20; /* LE Meta-Event */ hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events); - - if (lmp_le_capable(hdev)) { - memset(events, 0, sizeof(events)); - events[0] = 0x1f; - hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, - sizeof(events), events); - } } static void hci_init2_req(struct hci_request *req, unsigned long opt) @@ -1570,8 +1541,6 @@ static void hci_init2_req(struct hci_request *req, unsigned long opt) if (lmp_le_capable(hdev)) le_setup(req); - hci_setup_event_mask(req); - /* AVM Berlin (31), aka "BlueFRITZ!", doesn't support the read * local supported commands HCI command. 
*/ @@ -1654,7 +1623,7 @@ static void hci_set_le_support(struct hci_request *req) if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) { cp.le = 0x01; - cp.simul = lmp_le_br_capable(hdev); + cp.simul = 0x00; } if (cp.le != lmp_host_le_capable(hdev)) @@ -1688,7 +1657,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req) } /* Enable Authenticated Payload Timeout Expired event if supported */ - if (lmp_ping_capable(hdev)) + if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) events[2] |= 0x80; hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events); @@ -1699,6 +1668,8 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) struct hci_dev *hdev = req->hdev; u8 p; + hci_setup_event_mask(req); + /* Some Broadcom based Bluetooth controllers do not support the * Delete Stored Link Key command. They are clearly indicating its * absence in the bit mask of supported commands. @@ -1725,8 +1696,33 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[5] & 0x10) hci_setup_link_policy(req); - if (lmp_le_capable(hdev)) + if (lmp_le_capable(hdev)) { + u8 events[8]; + + memset(events, 0, sizeof(events)); + events[0] = 0x0f; + + if (hdev->le_features[0] & HCI_LE_ENCRYPTION) + events[0] |= 0x10; /* LE Long Term Key Request */ + + /* If controller supports the Connection Parameters Request + * Link Layer Procedure, enable the corresponding event. 
+ */ + if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC) + events[0] |= 0x20; /* LE Remote Connection + * Parameter Request + */ + + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), + events); + + if (hdev->commands[25] & 0x40) { + /* Read LE Advertising Channel TX Power */ + hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); + } + hci_set_le_support(req); + } /* Read features beyond page 1 if available */ for (p = 2; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) { @@ -1746,13 +1742,21 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[22] & 0x04) hci_set_event_mask_page_2(req); + /* Read local codec list if the HCI command is supported */ + if (hdev->commands[29] & 0x20) + hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); + + /* Get MWS transport configuration if the HCI command is supported */ + if (hdev->commands[30] & 0x08) + hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL); + /* Check for Synchronization Train support */ if (lmp_sync_train_capable(hdev)) hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL); /* Enable Secure Connections if supported and configured */ if ((lmp_sc_capable(hdev) || - test_bit(HCI_FORCE_SC, &hdev->dev_flags)) && + test_bit(HCI_FORCE_SC, &hdev->dbg_flags)) && test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) { u8 support = 0x01; hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT, @@ -1809,6 +1813,8 @@ static int __hci_init(struct hci_dev *hdev) debugfs_create_u16("hci_revision", 0444, hdev->debugfs, &hdev->hci_rev); debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev, &blacklist_fops); + debugfs_create_file("whitelist", 0444, hdev->debugfs, hdev, + &whitelist_fops); debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops); debugfs_create_file("conn_info_min_age", 0644, hdev->debugfs, hdev, @@ -1830,8 +1836,6 @@ static int __hci_init(struct hci_dev *hdev) if (lmp_ssp_capable(hdev)) { debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs, hdev, 
&auto_accept_delay_fops); - debugfs_create_file("ssp_debug_mode", 0644, hdev->debugfs, - hdev, &ssp_debug_mode_fops); debugfs_create_file("force_sc_support", 0644, hdev->debugfs, hdev, &force_sc_support_fops); debugfs_create_file("sc_only_mode", 0444, hdev->debugfs, @@ -1879,20 +1883,60 @@ static int __hci_init(struct hci_dev *hdev) hdev, &conn_min_interval_fops); debugfs_create_file("conn_max_interval", 0644, hdev->debugfs, hdev, &conn_max_interval_fops); + debugfs_create_file("conn_latency", 0644, hdev->debugfs, + hdev, &conn_latency_fops); + debugfs_create_file("supervision_timeout", 0644, hdev->debugfs, + hdev, &supervision_timeout_fops); debugfs_create_file("adv_channel_map", 0644, hdev->debugfs, hdev, &adv_channel_map_fops); - debugfs_create_file("6lowpan", 0644, hdev->debugfs, hdev, - &lowpan_debugfs_fops); - debugfs_create_file("le_auto_conn", 0644, hdev->debugfs, hdev, - &le_auto_conn_fops); + debugfs_create_file("adv_min_interval", 0644, hdev->debugfs, + hdev, &adv_min_interval_fops); + debugfs_create_file("adv_max_interval", 0644, hdev->debugfs, + hdev, &adv_max_interval_fops); + debugfs_create_file("device_list", 0444, hdev->debugfs, hdev, + &device_list_fops); debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs, &hdev->discov_interleaved_timeout); + + smp_register(hdev); } return 0; } +static void hci_init0_req(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + + BT_DBG("%s %ld", hdev->name, opt); + + /* Reset */ + if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) + hci_reset_req(req, 0); + + /* Read Local Version */ + hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL); + + /* Read BD Address */ + if (hdev->set_bdaddr) + hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL); +} + +static int __hci_unconf_init(struct hci_dev *hdev) +{ + int err; + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + return 0; + + err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT); + if (err < 0) + return 
err; + + return 0; +} + static void hci_scan_req(struct hci_request *req, unsigned long opt) { __u8 scan = opt; @@ -1973,16 +2017,20 @@ bool hci_discovery_active(struct hci_dev *hdev) void hci_discovery_set_state(struct hci_dev *hdev, int state) { + int old_state = hdev->discovery.state; + BT_DBG("%s state %u -> %u", hdev->name, hdev->discovery.state, state); - if (hdev->discovery.state == state) + if (old_state == state) return; + hdev->discovery.state = state; + switch (state) { case DISCOVERY_STOPPED: hci_update_background_scan(hdev); - if (hdev->discovery.state != DISCOVERY_STARTING) + if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: @@ -1995,8 +2043,6 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) case DISCOVERY_STOPPING: break; } - - hdev->discovery.state = state; } void hci_inquiry_cache_flush(struct hci_dev *hdev) @@ -2083,22 +2129,24 @@ void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, list_add(&ie->list, pos); } -bool hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, - bool name_known, bool *ssp) +u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, + bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; + u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr); - *ssp = data->ssp_mode; + if (!data->ssp_mode) + flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { - if (ie->data.ssp_mode) - *ssp = true; + if (!ie->data.ssp_mode) + flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { @@ -2110,9 +2158,11 @@ bool hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, } /* Entry not in the cache. Add new one. 
*/ - ie = kzalloc(sizeof(struct inquiry_entry), GFP_ATOMIC); - if (!ie) - return false; + ie = kzalloc(sizeof(*ie), GFP_KERNEL); + if (!ie) { + flags |= MGMT_DEV_FOUND_CONFIRM_NAME; + goto done; + } list_add(&ie->all, &cache->all); @@ -2135,9 +2185,10 @@ update: cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) - return false; + flags |= MGMT_DEV_FOUND_CONFIRM_NAME; - return true; +done: + return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) @@ -2186,12 +2237,6 @@ static void hci_inq_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); } -static int wait_inquiry(void *word) -{ - schedule(); - return signal_pending(current); -} - int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; @@ -2213,6 +2258,11 @@ int hci_inquiry(void __user *arg) goto done; } + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + err = -EOPNOTSUPP; + goto done; + } + if (hdev->dev_type != HCI_BREDR) { err = -EOPNOTSUPP; goto done; @@ -2242,7 +2292,7 @@ int hci_inquiry(void __user *arg) /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ - if (wait_on_bit(&hdev->flags, HCI_INQUIRY, wait_inquiry, + if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) return -EINTR; } @@ -2295,7 +2345,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) goto done; } - if (!test_bit(HCI_SETUP, &hdev->dev_flags)) { + if (!test_bit(HCI_SETUP, &hdev->dev_flags) && + !test_bit(HCI_CONFIG, &hdev->dev_flags)) { /* Check for rfkill but allow the HCI setup stage to * proceed (which in itself doesn't cause any RF activity). 
*/ @@ -2338,14 +2389,47 @@ static int hci_dev_do_open(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); - if (hdev->setup && test_bit(HCI_SETUP, &hdev->dev_flags)) - ret = hdev->setup(hdev); + if (test_bit(HCI_SETUP, &hdev->dev_flags)) { + if (hdev->setup) + ret = hdev->setup(hdev); - if (!ret) { - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) - set_bit(HCI_RAW, &hdev->flags); + /* The transport driver can set these quirks before + * creating the HCI device or in its setup callback. + * + * In case any of them is set, the controller has to + * start up as unconfigured. + */ + if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || + test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks)) + set_bit(HCI_UNCONFIGURED, &hdev->dev_flags); - if (!test_bit(HCI_RAW, &hdev->flags) && + /* For an unconfigured controller it is required to + * read at least the version information provided by + * the Read Local Version Information command. + * + * If the set_bdaddr driver callback is provided, then + * also the original Bluetooth public device address + * will be read using the Read BD Address command. + */ + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + ret = __hci_unconf_init(hdev); + } + + if (test_bit(HCI_CONFIG, &hdev->dev_flags)) { + /* If public address change is configured, ensure that + * the address gets programmed. If the driver does not + * support changing the public address, fail the power + * on procedure. 
+ */ + if (bacmp(&hdev->public_addr, BDADDR_ANY) && + hdev->set_bdaddr) + ret = hdev->set_bdaddr(hdev, &hdev->public_addr); + else + ret = -EADDRNOTAVAIL; + } + + if (!ret) { + if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) ret = __hci_init(hdev); } @@ -2358,6 +2442,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) set_bit(HCI_UP, &hdev->flags); hci_notify(hdev, HCI_DEV_UP); if (!test_bit(HCI_SETUP, &hdev->dev_flags) && + !test_bit(HCI_CONFIG, &hdev->dev_flags) && + !test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && hdev->dev_type == HCI_BREDR) { hci_dev_lock(hdev); @@ -2382,7 +2468,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) } hdev->close(hdev); - hdev->flags = 0; + hdev->flags &= BIT(HCI_RAW); } done: @@ -2401,6 +2487,21 @@ int hci_dev_open(__u16 dev) if (!hdev) return -ENODEV; + /* Devices that are marked as unconfigured can only be powered + * up as user channel. Trying to bring them up as normal devices + * will result into a failure. Only user channel operation is + * possible. + * + * When this function is called for a user channel, the flag + * HCI_USER_CHANNEL will be set first before attempting to + * open the device. + */ + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && + !test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + err = -EOPNOTSUPP; + goto done; + } + /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet @@ -2415,13 +2516,40 @@ int hci_dev_open(__u16 dev) */ flush_workqueue(hdev->req_workqueue); + /* For controllers not using the management interface and that + * are brought up using legacy ioctl, set the HCI_BONDABLE bit + * so that pairing works for them. Once the management interface + * is in use this bit will be cleared again and userspace has + * to explicitly enable it. 
+ */ + if (!test_bit(HCI_USER_CHANNEL, &hdev->dev_flags) && + !test_bit(HCI_MGMT, &hdev->dev_flags)) + set_bit(HCI_BONDABLE, &hdev->dev_flags); + err = hci_dev_do_open(hdev); +done: hci_dev_put(hdev); - return err; } +/* This function requires the caller holds hdev->lock */ +static void hci_pend_le_actions_clear(struct hci_dev *hdev) +{ + struct hci_conn_params *p; + + list_for_each_entry(p, &hdev->le_conn_params, list) { + if (p->conn) { + hci_conn_drop(p->conn); + hci_conn_put(p->conn); + p->conn = NULL; + } + list_del_init(&p->action); + } + + BT_DBG("All LE pending actions cleared"); +} + static int hci_dev_do_close(struct hci_dev *hdev) { BT_DBG("%s %p", hdev->name, hdev); @@ -2432,7 +2560,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_req_lock(hdev); if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { - del_timer_sync(&hdev->cmd_timer); + cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_unlock(hdev); return 0; } @@ -2458,8 +2586,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); + hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); - hci_pend_le_conns_clear(hdev); hci_dev_unlock(hdev); hci_notify(hdev, HCI_DEV_DOWN); @@ -2470,8 +2598,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!test_bit(HCI_RAW, &hdev->flags) && - !test_bit(HCI_AUTO_OFF, &hdev->dev_flags) && + if (!test_bit(HCI_AUTO_OFF, &hdev->dev_flags) && + !test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) && test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { set_bit(HCI_INIT, &hdev->flags); __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); @@ -2488,7 +2616,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Drop last sent command */ if (hdev->sent_cmd) { - del_timer_sync(&hdev->cmd_timer); + cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } @@ -2501,7 +2629,7 @@ static int 
hci_dev_do_close(struct hci_dev *hdev) hdev->close(hdev); /* Clear flags */ - hdev->flags = 0; + hdev->flags &= BIT(HCI_RAW); hdev->dev_flags &= ~HCI_PERSISTENT_MASK; if (!test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) { @@ -2570,6 +2698,11 @@ int hci_dev_reset(__u16 dev) goto done; } + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + ret = -EOPNOTSUPP; + goto done; + } + /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); @@ -2585,8 +2718,7 @@ int hci_dev_reset(__u16 dev) atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; - if (!test_bit(HCI_RAW, &hdev->flags)) - ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); + ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); done: hci_req_unlock(hdev); @@ -2608,6 +2740,11 @@ int hci_dev_reset_stat(__u16 dev) goto done; } + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + ret = -EOPNOTSUPP; + goto done; + } + memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: @@ -2615,6 +2752,42 @@ done: return ret; } +static void hci_update_scan_state(struct hci_dev *hdev, u8 scan) +{ + bool conn_changed, discov_changed; + + BT_DBG("%s scan 0x%02x", hdev->name, scan); + + if ((scan & SCAN_PAGE)) + conn_changed = !test_and_set_bit(HCI_CONNECTABLE, + &hdev->dev_flags); + else + conn_changed = test_and_clear_bit(HCI_CONNECTABLE, + &hdev->dev_flags); + + if ((scan & SCAN_INQUIRY)) { + discov_changed = !test_and_set_bit(HCI_DISCOVERABLE, + &hdev->dev_flags); + } else { + clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); + discov_changed = test_and_clear_bit(HCI_DISCOVERABLE, + &hdev->dev_flags); + } + + if (!test_bit(HCI_MGMT, &hdev->dev_flags)) + return; + + if (conn_changed || discov_changed) { + /* In case this was disabled through mgmt */ + set_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); + + if (test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + mgmt_update_adv_data(hdev); + + mgmt_new_settings(hdev); + } +} + int 
hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; @@ -2633,6 +2806,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + err = -EOPNOTSUPP; + goto done; + } + if (hdev->dev_type != HCI_BREDR) { err = -EOPNOTSUPP; goto done; @@ -2670,6 +2848,12 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) case HCISETSCAN: err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, HCI_INIT_TIMEOUT); + + /* Ensure that the connectable and discoverable states + * get correctly modified as this was a non-mgmt change. + */ + if (!err) + hci_update_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: @@ -2730,14 +2914,17 @@ int hci_get_dev_list(void __user *arg) read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { - if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - cancel_delayed_work(&hdev->power_off); + unsigned long flags = hdev->flags; - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) - set_bit(HCI_PAIRABLE, &hdev->dev_flags); + /* When the auto-off is configured it means the transport + * is running, but in that case still indicate that the + * device is actually down. 
+ */ + if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + flags &= ~BIT(HCI_UP); (dr + n)->dev_id = hdev->id; - (dr + n)->dev_opt = hdev->flags; + (dr + n)->dev_opt = flags; if (++n >= dev_num) break; @@ -2757,6 +2944,7 @@ int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; + unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) @@ -2766,16 +2954,19 @@ int hci_get_dev_info(void __user *arg) if (!hdev) return -ENODEV; - if (test_and_clear_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - cancel_delayed_work_sync(&hdev->power_off); - - if (!test_bit(HCI_MGMT, &hdev->dev_flags)) - set_bit(HCI_PAIRABLE, &hdev->dev_flags); + /* When the auto-off is configured it means the transport + * is running, but in that case still indicate that the + * device is actually down. + */ + if (test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) + flags = hdev->flags & ~BIT(HCI_UP); + else + flags = hdev->flags; strcpy(di.name, hdev->name); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4); - di.flags = hdev->flags; + di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; @@ -2815,7 +3006,8 @@ static int hci_rfkill_set_block(void *data, bool blocked) if (blocked) { set_bit(HCI_RFKILLED, &hdev->dev_flags); - if (!test_bit(HCI_SETUP, &hdev->dev_flags)) + if (!test_bit(HCI_SETUP, &hdev->dev_flags) && + !test_bit(HCI_CONFIG, &hdev->dev_flags)) hci_dev_do_close(hdev); } else { clear_bit(HCI_RFKILLED, &hdev->dev_flags); @@ -2846,6 +3038,7 @@ static void hci_power_on(struct work_struct *work) * valid, it is important to turn the device back off. 
*/ if (test_bit(HCI_RFKILLED, &hdev->dev_flags) || + test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) || (hdev->dev_type == HCI_BREDR && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { @@ -2856,8 +3049,34 @@ static void hci_power_on(struct work_struct *work) HCI_AUTO_OFF_TIMEOUT); } - if (test_and_clear_bit(HCI_SETUP, &hdev->dev_flags)) + if (test_and_clear_bit(HCI_SETUP, &hdev->dev_flags)) { + /* For unconfigured devices, set the HCI_RAW flag + * so that userspace can easily identify them. + */ + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + set_bit(HCI_RAW, &hdev->flags); + + /* For fully configured devices, this will send + * the Index Added event. For unconfigured devices, + * it will send Unconfigured Index Added event. + * + * Devices with HCI_QUIRK_RAW_DEVICE are ignored + * and no event will be sent. + */ mgmt_index_added(hdev); + } else if (test_and_clear_bit(HCI_CONFIG, &hdev->dev_flags)) { + /* When the controller is now configured, then it + * is important to clear the HCI_RAW flag. + */ + if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + clear_bit(HCI_RAW, &hdev->flags); + + /* Powering on the controller with HCI_CONFIG set only + * happens with the transition from unconfigured to + * configured. This will send the Index Added event. 
+ */ + mgmt_index_added(hdev); + } } static void hci_power_off(struct work_struct *work) @@ -2972,16 +3191,16 @@ static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, return false; } -static bool ltk_type_master(u8 type) +static u8 ltk_role(u8 type) { - if (type == HCI_SMP_STK || type == HCI_SMP_LTK) - return true; + if (type == SMP_LTK) + return HCI_ROLE_MASTER; - return false; + return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, __le64 rand, - bool master) + u8 role) { struct smp_ltk *k; @@ -2989,7 +3208,7 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, __le64 rand, if (k->ediv != ediv || k->rand != rand) continue; - if (ltk_type_master(k->type) != master) + if (ltk_role(k->type) != role) continue; return k; @@ -2999,14 +3218,14 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, __le64 rand, } struct smp_ltk *hci_find_ltk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, - u8 addr_type, bool master) + u8 addr_type, u8 role) { struct smp_ltk *k; list_for_each_entry(k, &hdev->long_term_keys, list) if (addr_type == k->bdaddr_type && bacmp(bdaddr, &k->bdaddr) == 0 && - ltk_type_master(k->type) == master) + ltk_role(k->type) == role) return k; return NULL; @@ -3022,7 +3241,7 @@ struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) } list_for_each_entry(irk, &hdev->identity_resolving_keys, list) { - if (smp_irk_matches(hdev->tfm_aes, irk->val, rpa)) { + if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); return irk; } @@ -3049,12 +3268,12 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, return NULL; } -int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, - bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len) +struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, + bdaddr_t *bdaddr, u8 *val, u8 type, + u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 
old_key_type; - bool persistent; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { @@ -3064,7 +3283,7 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, old_key_type = conn ? conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) - return -ENOMEM; + return NULL; list_add(&key->list, &hdev->link_keys); } @@ -3089,17 +3308,11 @@ int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key, else key->type = type; - if (!new_key) - return 0; - - persistent = hci_persistent_key(hdev, conn, type, old_key_type); - - mgmt_new_link_key(hdev, key, persistent); + if (persistent) + *persistent = hci_persistent_key(hdev, conn, type, + old_key_type); - if (conn) - conn->flush_key = !persistent; - - return 0; + return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, @@ -3107,9 +3320,9 @@ struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; - bool master = ltk_type_master(type); + u8 role = ltk_role(type); - old_key = hci_find_ltk_by_addr(hdev, bdaddr, addr_type, master); + old_key = hci_find_ltk_by_addr(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { @@ -3205,9 +3418,10 @@ void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) } /* HCI command timer function */ -static void hci_cmd_timeout(unsigned long arg) +static void hci_cmd_timeout(struct work_struct *work) { - struct hci_dev *hdev = (void *) arg; + struct hci_dev *hdev = container_of(work, struct hci_dev, + cmd_timer.work); if (hdev->sent_cmd) { struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; @@ -3313,12 +3527,12 @@ int hci_add_remote_oob_ext_data(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev, +struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct 
bdaddr_list *b; - list_for_each_entry(b, &hdev->blacklist, list) { + list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } @@ -3326,11 +3540,11 @@ struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev, return NULL; } -static void hci_blacklist_clear(struct hci_dev *hdev) +void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct list_head *p, *n; - list_for_each_safe(p, n, &hdev->blacklist) { + list_for_each_safe(p, n, bdaddr_list) { struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list); list_del(p); @@ -3338,99 +3552,38 @@ static void hci_blacklist_clear(struct hci_dev *hdev) } } -int hci_blacklist_add(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; - if (hci_blacklist_lookup(hdev, bdaddr, type)) + if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; - entry = kzalloc(sizeof(struct bdaddr_list), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; - list_add(&entry->list, &hdev->blacklist); + list_add(&entry->list, list); - return mgmt_device_blocked(hdev, bdaddr, type); + return 0; } -int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { - hci_blacklist_clear(hdev); + hci_bdaddr_list_clear(list); return 0; } - entry = hci_blacklist_lookup(hdev, bdaddr, type); - if (!entry) - return -ENOENT; - - list_del(&entry->list); - kfree(entry); - - return mgmt_device_unblocked(hdev, bdaddr, type); -} - -struct bdaddr_list *hci_white_list_lookup(struct hci_dev *hdev, - bdaddr_t *bdaddr, u8 type) -{ - struct bdaddr_list *b; - - list_for_each_entry(b, &hdev->le_white_list, list) { - if 
(!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) - return b; - } - - return NULL; -} - -void hci_white_list_clear(struct hci_dev *hdev) -{ - struct list_head *p, *n; - - list_for_each_safe(p, n, &hdev->le_white_list) { - struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list); - - list_del(p); - kfree(b); - } -} - -int hci_white_list_add(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) -{ - struct bdaddr_list *entry; - - if (!bacmp(bdaddr, BDADDR_ANY)) - return -EBADF; - - entry = kzalloc(sizeof(struct bdaddr_list), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - bacpy(&entry->bdaddr, bdaddr); - entry->bdaddr_type = type; - - list_add(&entry->list, &hdev->le_white_list); - - return 0; -} - -int hci_white_list_del(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) -{ - struct bdaddr_list *entry; - - if (!bacmp(bdaddr, BDADDR_ANY)) - return -EBADF; - - entry = hci_white_list_lookup(hdev, bdaddr, type); + entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; @@ -3446,6 +3599,10 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, { struct hci_conn_params *params; + /* The conn params list only contains identity addresses */ + if (!hci_is_identity_address(addr, addr_type)) + return NULL; + list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { @@ -3473,66 +3630,114 @@ static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) return true; } -static bool is_identity_address(bdaddr_t *addr, u8 addr_type) +/* This function requires the caller holds hdev->lock */ +struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, + bdaddr_t *addr, u8 addr_type) { - if (addr_type == ADDR_LE_DEV_PUBLIC) - return true; + struct hci_conn_params *param; - /* Check for Random Static address type */ - if ((addr->b[5] & 0xc0) == 0xc0) - return true; + /* The list only contains identity addresses */ + if 
(!hci_is_identity_address(addr, addr_type)) + return NULL; - return false; + list_for_each_entry(param, list, action) { + if (bacmp(&param->addr, addr) == 0 && + param->addr_type == addr_type) + return param; + } + + return NULL; } /* This function requires the caller holds hdev->lock */ -int hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, - u8 auto_connect, u16 conn_min_interval, - u16 conn_max_interval) +struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, + bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; - if (!is_identity_address(addr, addr_type)) - return -EINVAL; + if (!hci_is_identity_address(addr, addr_type)) + return NULL; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) - goto update; + return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { BT_ERR("Out of memory"); - return -ENOMEM; + return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); + INIT_LIST_HEAD(&params->action); -update: - params->conn_min_interval = conn_min_interval; - params->conn_max_interval = conn_max_interval; - params->auto_connect = auto_connect; + params->conn_min_interval = hdev->le_conn_min_interval; + params->conn_max_interval = hdev->le_conn_max_interval; + params->conn_latency = hdev->le_conn_latency; + params->supervision_timeout = hdev->le_supv_timeout; + params->auto_connect = HCI_AUTO_CONN_DISABLED; + + BT_DBG("addr %pMR (type %u)", addr, addr_type); + + return params; +} + +/* This function requires the caller holds hdev->lock */ +int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, + u8 auto_connect) +{ + struct hci_conn_params *params; + + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return -EIO; + + if (params->auto_connect == auto_connect) + return 0; + + list_del_init(&params->action); switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: case HCI_AUTO_CONN_LINK_LOSS: - 
hci_pend_le_conn_del(hdev, addr, addr_type); + hci_update_background_scan(hdev); break; + case HCI_AUTO_CONN_REPORT: + list_add(&params->action, &hdev->pend_le_reports); + hci_update_background_scan(hdev); + break; + case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - if (!is_connected(hdev, addr, addr_type)) - hci_pend_le_conn_add(hdev, addr, addr_type); + if (!is_connected(hdev, addr, addr_type)) { + list_add(&params->action, &hdev->pend_le_conns); + hci_update_background_scan(hdev); + } break; } - BT_DBG("addr %pMR (type %u) auto_connect %u conn_min_interval 0x%.4x " - "conn_max_interval 0x%.4x", addr, addr_type, auto_connect, - conn_min_interval, conn_max_interval); + params->auto_connect = auto_connect; + + BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, + auto_connect); return 0; } +static void hci_conn_params_free(struct hci_conn_params *params) +{ + if (params->conn) { + hci_conn_drop(params->conn); + hci_conn_put(params->conn); + } + + list_del(&params->action); + list_del(&params->list); + kfree(params); +} + /* This function requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { @@ -3542,97 +3747,39 @@ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) if (!params) return; - hci_pend_le_conn_del(hdev, addr, addr_type); + hci_conn_params_free(params); - list_del(&params->list); - kfree(params); + hci_update_background_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ -void hci_conn_params_clear(struct hci_dev *hdev) +void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { + if (params->auto_connect != HCI_AUTO_CONN_DISABLED) + continue; list_del(&params->list); kfree(params); } - BT_DBG("All LE connection parameters were removed"); + BT_DBG("All LE disabled connection parameters were removed"); 
} /* This function requires the caller holds hdev->lock */ -struct bdaddr_list *hci_pend_le_conn_lookup(struct hci_dev *hdev, - bdaddr_t *addr, u8 addr_type) +void hci_conn_params_clear_all(struct hci_dev *hdev) { - struct bdaddr_list *entry; - - list_for_each_entry(entry, &hdev->pend_le_conns, list) { - if (bacmp(&entry->bdaddr, addr) == 0 && - entry->bdaddr_type == addr_type) - return entry; - } - - return NULL; -} - -/* This function requires the caller holds hdev->lock */ -void hci_pend_le_conn_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) -{ - struct bdaddr_list *entry; - - entry = hci_pend_le_conn_lookup(hdev, addr, addr_type); - if (entry) - goto done; - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - BT_ERR("Out of memory"); - return; - } - - bacpy(&entry->bdaddr, addr); - entry->bdaddr_type = addr_type; - - list_add(&entry->list, &hdev->pend_le_conns); - - BT_DBG("addr %pMR (type %u)", addr, addr_type); - -done: - hci_update_background_scan(hdev); -} - -/* This function requires the caller holds hdev->lock */ -void hci_pend_le_conn_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) -{ - struct bdaddr_list *entry; - - entry = hci_pend_le_conn_lookup(hdev, addr, addr_type); - if (!entry) - goto done; - - list_del(&entry->list); - kfree(entry); + struct hci_conn_params *params, *tmp; - BT_DBG("addr %pMR (type %u)", addr, addr_type); + list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) + hci_conn_params_free(params); -done: hci_update_background_scan(hdev); -} - -/* This function requires the caller holds hdev->lock */ -void hci_pend_le_conns_clear(struct hci_dev *hdev) -{ - struct bdaddr_list *entry, *tmp; - - list_for_each_entry_safe(entry, tmp, &hdev->pend_le_conns, list) { - list_del(&entry->list); - kfree(entry); - } - BT_DBG("All LE pending connections cleared"); + BT_DBG("All LE connection parameters were removed"); } static void inquiry_complete(struct hci_dev *hdev, u8 status) @@ -3722,9 +3869,10 @@ 
static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags) || + if (test_bit(HCI_LE_ADV, &hdev->dev_flags) || hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { BT_DBG("Deferring random address update"); + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); return; } @@ -3750,7 +3898,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, !bacmp(&hdev->random_addr, &hdev->rpa)) return 0; - err = smp_generate_rpa(hdev->tfm_aes, hdev->irk, &hdev->rpa); + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { BT_ERR("%s failed to generate new RPA", hdev->name); return err; @@ -3784,7 +3932,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * the HCI command if the current random address is already the * static one. */ - if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) || + if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) || !bacmp(&hdev->bdaddr, BDADDR_ANY)) { *own_addr_type = ADDR_LE_DEV_RANDOM; if (bacmp(&hdev->static_addr, &hdev->random_addr)) @@ -3813,7 +3961,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { - if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dev_flags) || + if (test_bit(HCI_FORCE_STATIC_ADDR, &hdev->dbg_flags) || !bacmp(&hdev->bdaddr, BDADDR_ANY)) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; @@ -3828,7 +3976,7 @@ struct hci_dev *hci_alloc_dev(void) { struct hci_dev *hdev; - hdev = kzalloc(sizeof(struct hci_dev), GFP_KERNEL); + hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (!hdev) return NULL; @@ -3837,6 +3985,7 @@ struct hci_dev *hci_alloc_dev(void) hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* 
No Input No Output */ + hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; @@ -3844,10 +3993,14 @@ struct hci_dev *hci_alloc_dev(void) hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; + hdev->le_adv_min_interval = 0x0800; + hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; hdev->le_conn_min_interval = 0x0028; hdev->le_conn_max_interval = 0x0038; + hdev->le_conn_latency = 0x0000; + hdev->le_supv_timeout = 0x002a; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; @@ -3859,6 +4012,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->blacklist); + INIT_LIST_HEAD(&hdev->whitelist); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); @@ -3867,6 +4021,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->le_white_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); + INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_WORK(&hdev->rx_work, hci_rx_work); @@ -3884,7 +4039,7 @@ struct hci_dev *hci_alloc_dev(void) init_waitqueue_head(&hdev->req_wait_q); - setup_timer(&hdev->cmd_timer, hci_cmd_timeout, (unsigned long) hdev); + INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); hci_init_sysfs(hdev); discovery_init(hdev); @@ -3906,7 +4061,7 @@ int hci_register_dev(struct hci_dev *hdev) { int id, error; - if (!hdev->open || !hdev->close) + if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; /* Do not allow HCI_AMP devices to register at index 0, @@ -3951,18 +4106,9 @@ int hci_register_dev(struct hci_dev *hdev) dev_set_name(&hdev->dev, "%s", hdev->name); - hdev->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(hdev->tfm_aes)) { - BT_ERR("Unable to create 
crypto context"); - error = PTR_ERR(hdev->tfm_aes); - hdev->tfm_aes = NULL; - goto err_wqueue; - } - error = device_add(&hdev->dev); if (error < 0) - goto err_tfm; + goto err_wqueue; hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, @@ -3991,6 +4137,12 @@ int hci_register_dev(struct hci_dev *hdev) list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); + /* Devices that are marked for raw-only usage are unconfigured + * and should not be included in normal operation. + */ + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + set_bit(HCI_UNCONFIGURED, &hdev->dev_flags); + hci_notify(hdev, HCI_DEV_REG); hci_dev_hold(hdev); @@ -3998,8 +4150,6 @@ int hci_register_dev(struct hci_dev *hdev) return id; -err_tfm: - crypto_free_blkcipher(hdev->tfm_aes); err_wqueue: destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); @@ -4033,7 +4183,8 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->power_on); if (!test_bit(HCI_INIT, &hdev->flags) && - !test_bit(HCI_SETUP, &hdev->dev_flags)) { + !test_bit(HCI_SETUP, &hdev->dev_flags) && + !test_bit(HCI_CONFIG, &hdev->dev_flags)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); @@ -4050,8 +4201,7 @@ void hci_unregister_dev(struct hci_dev *hdev) rfkill_destroy(hdev->rfkill); } - if (hdev->tfm_aes) - crypto_free_blkcipher(hdev->tfm_aes); + smp_unregister(hdev); device_del(&hdev->dev); @@ -4061,15 +4211,15 @@ void hci_unregister_dev(struct hci_dev *hdev) destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); - hci_blacklist_clear(hdev); + hci_bdaddr_list_clear(&hdev->blacklist); + hci_bdaddr_list_clear(&hdev->whitelist); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); - hci_white_list_clear(hdev); - hci_conn_params_clear(hdev); - hci_pend_le_conns_clear(hdev); + hci_bdaddr_list_clear(&hdev->le_white_list); + 
hci_conn_params_clear_all(hdev); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -4224,26 +4374,6 @@ static int hci_reassembly(struct hci_dev *hdev, int type, void *data, return remain; } -int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count) -{ - int rem = 0; - - if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) - return -EILSEQ; - - while (count) { - rem = hci_reassembly(hdev, type, data, count, type - 1); - if (rem < 0) - return rem; - - data += (count - rem); - count = rem; - } - - return rem; -} -EXPORT_SYMBOL(hci_recv_fragment); - #define STREAM_REASSEMBLY 0 int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count) @@ -4307,6 +4437,8 @@ EXPORT_SYMBOL(hci_unregister_cb); static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { + int err; + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); /* Time stamp */ @@ -4323,8 +4455,11 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) /* Get rid of skb owner, prior to sending to the driver. 
*/ skb_orphan(skb); - if (hdev->send(hdev, skb) < 0) - BT_ERR("%s sending frame failed", hdev->name); + err = hdev->send(hdev, skb); + if (err < 0) { + BT_ERR("%s sending frame failed (%d)", hdev->name, err); + kfree_skb(skb); + } } void hci_req_init(struct hci_request *req, struct hci_dev *hdev) @@ -4366,6 +4501,11 @@ int hci_req_run(struct hci_request *req, hci_req_complete_t complete) return 0; } +bool hci_req_pending(struct hci_dev *hdev) +{ + return (hdev->req_status == HCI_REQ_PEND); +} + static struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { @@ -4387,6 +4527,7 @@ static struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, BT_DBG("skb len %d", skb->len); bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->opcode = opcode; return skb; } @@ -4798,7 +4939,7 @@ static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) { - if (!test_bit(HCI_RAW, &hdev->flags)) { + if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { /* ACL tx timeout must be longer than maximum * link supervision timeout (40.9 seconds) */ if (!cnt && time_after(jiffies, hdev->acl_last_tx + @@ -4981,7 +5122,7 @@ static void hci_sched_le(struct hci_dev *hdev) if (!hci_conn_num(hdev, LE_LINK)) return; - if (!test_bit(HCI_RAW, &hdev->flags)) { + if (!test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { /* LE tx timeout must be longer than maximum * link supervision timeout (40.9 seconds) */ if (!hdev->le_cnt && hdev->le_pkts && @@ -5226,8 +5367,7 @@ static void hci_rx_work(struct work_struct *work) hci_send_to_sock(hdev, skb); } - if (test_bit(HCI_RAW, &hdev->flags) || - test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { + if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { kfree_skb(skb); continue; } @@ -5287,10 +5427,10 @@ static void hci_cmd_work(struct work_struct *work) atomic_dec(&hdev->cmd_cnt); hci_send_frame(hdev, skb); if (test_bit(HCI_RESET, 
&hdev->flags)) - del_timer(&hdev->cmd_timer); + cancel_delayed_work(&hdev->cmd_timer); else - mod_timer(&hdev->cmd_timer, - jiffies + HCI_CMD_TIMEOUT); + schedule_delayed_work(&hdev->cmd_timer, + HCI_CMD_TIMEOUT); } else { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -5307,26 +5447,135 @@ void hci_req_add_le_scan_disable(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); } +static void add_to_white_list(struct hci_request *req, + struct hci_conn_params *params) +{ + struct hci_cp_le_add_to_white_list cp; + + cp.bdaddr_type = params->addr_type; + bacpy(&cp.bdaddr, &params->addr); + + hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); +} + +static u8 update_white_list(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_conn_params *params; + struct bdaddr_list *b; + uint8_t white_list_entries = 0; + + /* Go through the current white list programmed into the + * controller one by one and check if that address is still + * in the list of pending connections or list of devices to + * report. If not present in either list, then queue the + * command to remove it from the controller. + */ + list_for_each_entry(b, &hdev->le_white_list, list) { + struct hci_cp_le_del_from_white_list cp; + + if (hci_pend_le_action_lookup(&hdev->pend_le_conns, + &b->bdaddr, b->bdaddr_type) || + hci_pend_le_action_lookup(&hdev->pend_le_reports, + &b->bdaddr, b->bdaddr_type)) { + white_list_entries++; + continue; + } + + cp.bdaddr_type = b->bdaddr_type; + bacpy(&cp.bdaddr, &b->bdaddr); + + hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, + sizeof(cp), &cp); + } + + /* Since all no longer valid white list entries have been + * removed, walk through the list of pending connections + * and ensure that any new device gets programmed into + * the controller. 
+ * + * If the list of the devices is larger than the list of + * available white list entries in the controller, then + * just abort and return filer policy value to not use the + * white list. + */ + list_for_each_entry(params, &hdev->pend_le_conns, action) { + if (hci_bdaddr_list_lookup(&hdev->le_white_list, + &params->addr, params->addr_type)) + continue; + + if (white_list_entries >= hdev->le_white_list_size) { + /* Select filter policy to accept all advertising */ + return 0x00; + } + + if (hci_find_irk_by_addr(hdev, &params->addr, + params->addr_type)) { + /* White list can not be used with RPAs */ + return 0x00; + } + + white_list_entries++; + add_to_white_list(req, params); + } + + /* After adding all new pending connections, walk through + * the list of pending reports and also add these to the + * white list if there is still space. + */ + list_for_each_entry(params, &hdev->pend_le_reports, action) { + if (hci_bdaddr_list_lookup(&hdev->le_white_list, + &params->addr, params->addr_type)) + continue; + + if (white_list_entries >= hdev->le_white_list_size) { + /* Select filter policy to accept all advertising */ + return 0x00; + } + + if (hci_find_irk_by_addr(hdev, &params->addr, + params->addr_type)) { + /* White list can not be used with RPAs */ + return 0x00; + } + + white_list_entries++; + add_to_white_list(req, params); + } + + /* Select filter policy to use white list */ + return 0x01; +} + void hci_req_add_le_passive_scan(struct hci_request *req) { struct hci_cp_le_set_scan_param param_cp; struct hci_cp_le_set_scan_enable enable_cp; struct hci_dev *hdev = req->hdev; u8 own_addr_type; + u8 filter_policy; - /* Set require_privacy to true to avoid identification from - * unknown peer devices. Since this is passive scanning, no - * SCAN_REQ using the local identity should be sent. Mandating - * privacy is just an extra precaution. + /* Set require_privacy to false since no SCAN_REQ are send + * during passive scanning. 
Not using an unresolvable address + * here is important so that peer devices using direct + * advertising with our address will be correctly reported + * by the controller. */ - if (hci_update_random_address(req, true, &own_addr_type)) + if (hci_update_random_address(req, false, &own_addr_type)) return; + /* Adding or removing entries from the white list must + * happen before enabling scanning. The controller does + * not allow white list modification while scanning. + */ + filter_policy = update_white_list(req); + memset(&param_cp, 0, sizeof(param_cp)); param_cp.type = LE_SCAN_PASSIVE; param_cp.interval = cpu_to_le16(hdev->le_scan_interval); param_cp.window = cpu_to_le16(hdev->le_scan_window); param_cp.own_address_type = own_addr_type; + param_cp.filter_policy = filter_policy; hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), &param_cp); @@ -5356,11 +5605,29 @@ void hci_update_background_scan(struct hci_dev *hdev) struct hci_conn *conn; int err; + if (!test_bit(HCI_UP, &hdev->flags) || + test_bit(HCI_INIT, &hdev->flags) || + test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags) || + test_bit(HCI_AUTO_OFF, &hdev->dev_flags) || + test_bit(HCI_UNREGISTER, &hdev->dev_flags)) + return; + + /* No point in doing scanning if LE support hasn't been enabled */ + if (!test_bit(HCI_LE_ENABLED, &hdev->dev_flags)) + return; + + /* If discovery is active don't interfere with it */ + if (hdev->discovery.state != DISCOVERY_STOPPED) + return; + hci_req_init(&req, hdev); - if (list_empty(&hdev->pend_le_conns)) { - /* If there is no pending LE connections, we should stop - * the background scanning. + if (list_empty(&hdev->pend_le_conns) && + list_empty(&hdev->pend_le_reports)) { + /* If there is no pending LE connections or devices + * to be scanned for, we should stop the background + * scanning. */ /* If controller is not scanning we are done. 
*/ @@ -5398,3 +5665,52 @@ void hci_update_background_scan(struct hci_dev *hdev) if (err) BT_ERR("Failed to run HCI request: err %d", err); } + +static bool disconnected_whitelist_entries(struct hci_dev *hdev) +{ + struct bdaddr_list *b; + + list_for_each_entry(b, &hdev->whitelist, list) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); + if (!conn) + return true; + + if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) + return true; + } + + return false; +} + +void hci_update_page_scan(struct hci_dev *hdev, struct hci_request *req) +{ + u8 scan; + + if (!test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) + return; + + if (!hdev_is_powered(hdev)) + return; + + if (mgmt_powering_down(hdev)) + return; + + if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags) || + disconnected_whitelist_entries(hdev)) + scan = SCAN_PAGE; + else + scan = SCAN_DISABLED; + + if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE)) + return; + + if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) + scan |= SCAN_INQUIRY; + + if (req) + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + else + hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 640c54ec1bd2..8b0a2a6de419 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -32,6 +32,7 @@ #include "a2mp.h" #include "amp.h" +#include "smp.h" /* Handle HCI Event packets */ @@ -100,12 +101,8 @@ static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); - if (conn) { - if (rp->role) - conn->link_mode &= ~HCI_LM_MASTER; - else - conn->link_mode |= HCI_LM_MASTER; - } + if (conn) + conn->role = rp->role; hci_dev_unlock(hdev); } @@ -174,12 +171,14 @@ static void hci_cc_write_def_link_policy(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = 
hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY); if (!sent) return; - if (!status) - hdev->link_policy = get_unaligned_le16(sent); + hdev->link_policy = get_unaligned_le16(sent); } static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) @@ -269,28 +268,30 @@ static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb) static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb) { __u8 status = *((__u8 *) skb->data); + __u8 param; void *sent; BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE); if (!sent) return; - if (!status) { - __u8 param = *((__u8 *) sent); + param = *((__u8 *) sent); - if (param) - set_bit(HCI_ENCRYPT, &hdev->flags); - else - clear_bit(HCI_ENCRYPT, &hdev->flags); - } + if (param) + set_bit(HCI_ENCRYPT, &hdev->flags); + else + clear_bit(HCI_ENCRYPT, &hdev->flags); } static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) { - __u8 param, status = *((__u8 *) skb->data); - int old_pscan, old_iscan; + __u8 status = *((__u8 *) skb->data); + __u8 param; void *sent; BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -304,32 +305,19 @@ static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); if (status) { - mgmt_write_scan_failed(hdev, param, status); hdev->discov_timeout = 0; goto done; } - /* We need to ensure that we set this back on if someone changed - * the scan mode through a raw HCI socket. 
- */ - set_bit(HCI_BREDR_ENABLED, &hdev->dev_flags); - - old_pscan = test_and_clear_bit(HCI_PSCAN, &hdev->flags); - old_iscan = test_and_clear_bit(HCI_ISCAN, &hdev->flags); - - if (param & SCAN_INQUIRY) { + if (param & SCAN_INQUIRY) set_bit(HCI_ISCAN, &hdev->flags); - if (!old_iscan) - mgmt_discoverable(hdev, 1); - } else if (old_iscan) - mgmt_discoverable(hdev, 0); + else + clear_bit(HCI_ISCAN, &hdev->flags); - if (param & SCAN_PAGE) { + if (param & SCAN_PAGE) set_bit(HCI_PSCAN, &hdev->flags); - if (!old_pscan) - mgmt_connectable(hdev, 1); - } else if (old_pscan) - mgmt_connectable(hdev, 0); + else + clear_bit(HCI_PSCAN, &hdev->flags); done: hci_dev_unlock(hdev); @@ -601,8 +589,10 @@ static void hci_cc_read_flow_control_mode(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) - hdev->flow_ctl_mode = rp->mode; + if (rp->status) + return; + + hdev->flow_ctl_mode = rp->mode; } static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb) @@ -637,8 +627,14 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) + if (rp->status) + return; + + if (test_bit(HCI_INIT, &hdev->flags)) bacpy(&hdev->bdaddr, &rp->bdaddr); + + if (test_bit(HCI_SETUP, &hdev->dev_flags)) + bacpy(&hdev->setup_addr, &rp->bdaddr); } static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, @@ -648,7 +644,10 @@ static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (test_bit(HCI_INIT, &hdev->flags) && !rp->status) { + if (rp->status) + return; + + if (test_bit(HCI_INIT, &hdev->flags)) { hdev->page_scan_interval = __le16_to_cpu(rp->interval); hdev->page_scan_window = __le16_to_cpu(rp->window); } @@ -680,7 +679,10 @@ static void hci_cc_read_page_scan_type(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (test_bit(HCI_INIT, &hdev->flags) && 
!rp->status) + if (rp->status) + return; + + if (test_bit(HCI_INIT, &hdev->flags)) hdev->page_scan_type = rp->type; } @@ -720,6 +722,41 @@ static void hci_cc_read_data_block_size(struct hci_dev *hdev, hdev->block_cnt, hdev->block_len); } +static void hci_cc_read_clock(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_read_clock *rp = (void *) skb->data; + struct hci_cp_read_clock *cp; + struct hci_conn *conn; + + BT_DBG("%s", hdev->name); + + if (skb->len < sizeof(*rp)) + return; + + if (rp->status) + return; + + hci_dev_lock(hdev); + + cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK); + if (!cp) + goto unlock; + + if (cp->which == 0x00) { + hdev->clock = le32_to_cpu(rp->clock); + goto unlock; + } + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); + if (conn) { + conn->clock = le32_to_cpu(rp->clock); + conn->clock_accuracy = le16_to_cpu(rp->accuracy); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_cc_read_local_amp_info(struct hci_dev *hdev, struct sk_buff *skb) { @@ -789,8 +826,10 @@ static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) - hdev->inq_tx_power = rp->tx_power; + if (rp->status) + return; + + hdev->inq_tx_power = rp->tx_power; } static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb) @@ -861,8 +900,10 @@ static void hci_cc_le_read_local_features(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) - memcpy(hdev->le_features, rp->features, 8); + if (rp->status) + return; + + memcpy(hdev->le_features, rp->features, 8); } static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, @@ -872,8 +913,10 @@ static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - if (!rp->status) - hdev->adv_tx_power = rp->tx_power; + if (rp->status) + return; + + hdev->adv_tx_power = rp->tx_power; } static void 
hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb) @@ -973,14 +1016,16 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_RANDOM_ADDR); if (!sent) return; hci_dev_lock(hdev); - if (!status) - bacpy(&hdev->random_addr, sent); + bacpy(&hdev->random_addr, sent); hci_dev_unlock(hdev); } @@ -991,11 +1036,11 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, status); - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_ENABLE); - if (!sent) + if (status) return; - if (status) + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_ENABLE); + if (!sent) return; hci_dev_lock(hdev); @@ -1006,15 +1051,17 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) if (*sent) { struct hci_conn *conn; + set_bit(HCI_LE_ADV, &hdev->dev_flags); + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); if (conn) queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, - HCI_LE_CONN_TIMEOUT); + conn->conn_timeout); + } else { + clear_bit(HCI_LE_ADV, &hdev->dev_flags); } - mgmt_advertising(hdev, *sent); - hci_dev_unlock(hdev); } @@ -1025,14 +1072,16 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_PARAM); if (!cp) return; hci_dev_lock(hdev); - if (!status) - hdev->le_scan_type = cp->type; + hdev->le_scan_type = cp->type; hci_dev_unlock(hdev); } @@ -1053,13 +1102,15 @@ static void clear_pending_adv_report(struct hci_dev *hdev) } static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, - u8 bdaddr_type, s8 rssi, u8 *data, u8 len) + u8 bdaddr_type, s8 rssi, u32 flags, + u8 *data, u8 len) { struct discovery_state *d = &hdev->discovery; 
bacpy(&d->last_adv_addr, bdaddr); d->last_adv_addr_type = bdaddr_type; d->last_adv_rssi = rssi; + d->last_adv_flags = flags; memcpy(d->last_adv_data, data, len); d->last_adv_data_len = len; } @@ -1072,11 +1123,11 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); - cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE); - if (!cp) + if (status) return; - if (status) + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE); + if (!cp) return; switch (cp->enable) { @@ -1096,7 +1147,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, d->last_adv_addr_type, NULL, - d->last_adv_rssi, 0, 1, + d->last_adv_rssi, d->last_adv_flags, d->last_adv_data, d->last_adv_data_len, NULL, 0); } @@ -1107,13 +1158,21 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, cancel_delayed_work(&hdev->le_scan_disable); clear_bit(HCI_LE_SCAN, &hdev->dev_flags); + /* The HCI_LE_SCAN_INTERRUPTED flag indicates that we * interrupted scanning due to a connect request. Mark - * therefore discovery as stopped. + * therefore discovery as stopped. If this was not + * because of a connect request advertising might have + * been disabled because of active scanning, so + * re-enable it again if necessary. 
*/ if (test_and_clear_bit(HCI_LE_SCAN_INTERRUPTED, &hdev->dev_flags)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + else if (!test_bit(HCI_LE_ADV, &hdev->dev_flags) && + hdev->discovery.state == DISCOVERY_FINDING) + mgmt_reenable_advertising(hdev); + break; default: @@ -1129,8 +1188,10 @@ static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); - if (!rp->status) - hdev->le_white_list_size = rp->size; + if (rp->status) + return; + + hdev->le_white_list_size = rp->size; } static void hci_cc_le_clear_white_list(struct hci_dev *hdev, @@ -1140,8 +1201,10 @@ static void hci_cc_le_clear_white_list(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); - if (!status) - hci_white_list_clear(hdev); + if (status) + return; + + hci_bdaddr_list_clear(&hdev->le_white_list); } static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, @@ -1152,12 +1215,15 @@ static void hci_cc_le_add_to_white_list(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST); if (!sent) return; - if (!status) - hci_white_list_add(hdev, &sent->bdaddr, sent->bdaddr_type); + hci_bdaddr_list_add(&hdev->le_white_list, &sent->bdaddr, + sent->bdaddr_type); } static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, @@ -1168,12 +1234,15 @@ static void hci_cc_le_del_from_white_list(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST); if (!sent) return; - if (!status) - hci_white_list_del(hdev, &sent->bdaddr, sent->bdaddr_type); + hci_bdaddr_list_del(&hdev->le_white_list, &sent->bdaddr, + sent->bdaddr_type); } static void hci_cc_le_read_supported_states(struct hci_dev *hdev, @@ -1183,8 +1252,10 @@ static void hci_cc_le_read_supported_states(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", 
hdev->name, rp->status); - if (!rp->status) - memcpy(hdev->le_states, rp->le_states, 8); + if (rp->status) + return; + + memcpy(hdev->le_states, rp->le_states, 8); } static void hci_cc_write_le_host_supported(struct hci_dev *hdev, @@ -1195,25 +1266,26 @@ static void hci_cc_write_le_host_supported(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, status); + if (status) + return; + sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED); if (!sent) return; - if (!status) { - if (sent->le) { - hdev->features[1][0] |= LMP_HOST_LE; - set_bit(HCI_LE_ENABLED, &hdev->dev_flags); - } else { - hdev->features[1][0] &= ~LMP_HOST_LE; - clear_bit(HCI_LE_ENABLED, &hdev->dev_flags); - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); - } - - if (sent->simul) - hdev->features[1][0] |= LMP_HOST_LE_BREDR; - else - hdev->features[1][0] &= ~LMP_HOST_LE_BREDR; + if (sent->le) { + hdev->features[1][0] |= LMP_HOST_LE; + set_bit(HCI_LE_ENABLED, &hdev->dev_flags); + } else { + hdev->features[1][0] &= ~LMP_HOST_LE; + clear_bit(HCI_LE_ENABLED, &hdev->dev_flags); + clear_bit(HCI_ADVERTISING, &hdev->dev_flags); } + + if (sent->simul) + hdev->features[1][0] |= LMP_HOST_LE_BREDR; + else + hdev->features[1][0] &= ~LMP_HOST_LE_BREDR; } static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) @@ -1342,11 +1414,9 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) } } else { if (!conn) { - conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr); - if (conn) { - conn->out = true; - conn->link_mode |= HCI_LM_MASTER; - } else + conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr, + HCI_ROLE_MASTER); + if (!conn) BT_ERR("No memory for new connection"); } } @@ -1575,6 +1645,8 @@ static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested auth_cp; + set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); + auth_cp.handle = __cpu_to_le16(conn->handle); hci_send_cmd(hdev, 
HCI_OP_AUTH_REQUESTED, sizeof(auth_cp), &auth_cp); @@ -1835,7 +1907,7 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) if (cp->filter_policy == HCI_LE_USE_PEER_ADDR) queue_delayed_work(conn->hdev->workqueue, &conn->le_conn_timeout, - HCI_LE_CONN_TIMEOUT); + conn->conn_timeout); unlock: hci_dev_unlock(hdev); @@ -1929,7 +2001,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); for (; num_rsp; num_rsp--, info++) { - bool name_known, ssp; + u32 flags; bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; @@ -1940,10 +2012,10 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) data.rssi = 0x00; data.ssp_mode = 0x00; - name_known = hci_inquiry_cache_update(hdev, &data, false, &ssp); + flags = hci_inquiry_cache_update(hdev, &data, false); + mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, - info->dev_class, 0, !name_known, ssp, NULL, - 0, NULL, 0); + info->dev_class, 0, flags, NULL, 0, NULL, 0); } hci_dev_unlock(hdev); @@ -1988,10 +2060,10 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_add_sysfs(conn); if (test_bit(HCI_AUTH, &hdev->flags)) - conn->link_mode |= HCI_LM_AUTH; + set_bit(HCI_CONN_AUTH, &conn->flags); if (test_bit(HCI_ENCRYPT, &hdev->flags)) - conn->link_mode |= HCI_LM_ENCRYPT; + set_bit(HCI_CONN_ENCRYPT, &conn->flags); /* Get remote features */ if (conn->type == ACL_LINK) { @@ -1999,6 +2071,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) cp.handle = ev->handle; hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES, sizeof(cp), &cp); + + hci_update_page_scan(hdev, NULL); } /* Set packet type for incoming connection */ @@ -2031,10 +2105,21 @@ unlock: hci_conn_check_pending(hdev); } +static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct hci_cp_reject_conn_req cp; + + bacpy(&cp.bdaddr, bdaddr); + cp.reason = HCI_ERROR_REJ_BAD_ADDR; + 
hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp); +} + static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_conn_request *ev = (void *) skb->data; int mask = hdev->link_mode; + struct inquiry_entry *ie; + struct hci_conn *conn; __u8 flags = 0; BT_DBG("%s bdaddr %pMR type 0x%x", hdev->name, &ev->bdaddr, @@ -2043,73 +2128,79 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb) mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type, &flags); - if ((mask & HCI_LM_ACCEPT) && - !hci_blacklist_lookup(hdev, &ev->bdaddr, BDADDR_BREDR)) { - /* Connection accepted */ - struct inquiry_entry *ie; - struct hci_conn *conn; + if (!(mask & HCI_LM_ACCEPT)) { + hci_reject_conn(hdev, &ev->bdaddr); + return; + } - hci_dev_lock(hdev); + if (hci_bdaddr_list_lookup(&hdev->blacklist, &ev->bdaddr, + BDADDR_BREDR)) { + hci_reject_conn(hdev, &ev->bdaddr); + return; + } - ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); - if (ie) - memcpy(ie->data.dev_class, ev->dev_class, 3); + if (!test_bit(HCI_CONNECTABLE, &hdev->dev_flags) && + !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr, + BDADDR_BREDR)) { + hci_reject_conn(hdev, &ev->bdaddr); + return; + } + + /* Connection accepted */ + + hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, - &ev->bdaddr); + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); + if (ie) + memcpy(ie->data.dev_class, ev->dev_class, 3); + + conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, + &ev->bdaddr); + if (!conn) { + conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, + HCI_ROLE_SLAVE); if (!conn) { - conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { - BT_ERR("No memory for new connection"); - hci_dev_unlock(hdev); - return; - } + BT_ERR("No memory for new connection"); + hci_dev_unlock(hdev); + return; } + } - memcpy(conn->dev_class, ev->dev_class, 3); + memcpy(conn->dev_class, ev->dev_class, 3); - hci_dev_unlock(hdev); + 
hci_dev_unlock(hdev); - if (ev->link_type == ACL_LINK || - (!(flags & HCI_PROTO_DEFER) && !lmp_esco_capable(hdev))) { - struct hci_cp_accept_conn_req cp; - conn->state = BT_CONNECT; + if (ev->link_type == ACL_LINK || + (!(flags & HCI_PROTO_DEFER) && !lmp_esco_capable(hdev))) { + struct hci_cp_accept_conn_req cp; + conn->state = BT_CONNECT; - bacpy(&cp.bdaddr, &ev->bdaddr); + bacpy(&cp.bdaddr, &ev->bdaddr); - if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) - cp.role = 0x00; /* Become master */ - else - cp.role = 0x01; /* Remain slave */ + if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER)) + cp.role = 0x00; /* Become master */ + else + cp.role = 0x01; /* Remain slave */ - hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), - &cp); - } else if (!(flags & HCI_PROTO_DEFER)) { - struct hci_cp_accept_sync_conn_req cp; - conn->state = BT_CONNECT; + hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp); + } else if (!(flags & HCI_PROTO_DEFER)) { + struct hci_cp_accept_sync_conn_req cp; + conn->state = BT_CONNECT; - bacpy(&cp.bdaddr, &ev->bdaddr); - cp.pkt_type = cpu_to_le16(conn->pkt_type); + bacpy(&cp.bdaddr, &ev->bdaddr); + cp.pkt_type = cpu_to_le16(conn->pkt_type); - cp.tx_bandwidth = cpu_to_le32(0x00001f40); - cp.rx_bandwidth = cpu_to_le32(0x00001f40); - cp.max_latency = cpu_to_le16(0xffff); - cp.content_format = cpu_to_le16(hdev->voice_setting); - cp.retrans_effort = 0xff; + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + cp.max_latency = cpu_to_le16(0xffff); + cp.content_format = cpu_to_le16(hdev->voice_setting); + cp.retrans_effort = 0xff; - hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, - sizeof(cp), &cp); - } else { - conn->state = BT_CONNECT2; - hci_proto_connect_cfm(conn, 0); - } + hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp), + &cp); } else { - /* Connection rejected */ - struct hci_cp_reject_conn_req cp; - - bacpy(&cp.bdaddr, &ev->bdaddr); - cp.reason = HCI_ERROR_REJ_BAD_ADDR; - 
hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp); + conn->state = BT_CONNECT2; + hci_proto_connect_cfm(conn, 0); } } @@ -2158,8 +2249,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, reason, mgmt_connected); - if (conn->type == ACL_LINK && conn->flush_key) - hci_remove_link_key(hdev, &conn->dst); + if (conn->type == ACL_LINK) { + if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) + hci_remove_link_key(hdev, &conn->dst); + + hci_update_page_scan(hdev, NULL); + } params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { @@ -2169,8 +2264,11 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) break; /* Fall through */ + case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - hci_pend_le_conn_add(hdev, &conn->dst, conn->dst_type); + list_del_init(&params->action); + list_add(&params->action, &hdev->pend_le_conns); + hci_update_background_scan(hdev); break; default: @@ -2218,12 +2316,11 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) { BT_INFO("re-auth of legacy device is not possible."); } else { - conn->link_mode |= HCI_LM_AUTH; + set_bit(HCI_CONN_AUTH, &conn->flags); conn->sec_level = conn->pending_sec_level; } } else { - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); } clear_bit(HCI_CONN_AUTH_PEND, &conn->flags); @@ -2297,6 +2394,9 @@ check_auth: if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; + + set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); + cp.handle = __cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); } @@ -2321,23 +2421,29 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!ev->status) { if (ev->encrypt) { /* Encryption implies authentication
*/ - conn->link_mode |= HCI_LM_AUTH; - conn->link_mode |= HCI_LM_ENCRYPT; + set_bit(HCI_CONN_AUTH, &conn->flags); + set_bit(HCI_CONN_ENCRYPT, &conn->flags); conn->sec_level = conn->pending_sec_level; /* P-256 authentication key implies FIPS */ if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256) - conn->link_mode |= HCI_LM_FIPS; + set_bit(HCI_CONN_FIPS, &conn->flags); if ((conn->type == ACL_LINK && ev->encrypt == 0x02) || conn->type == LE_LINK) set_bit(HCI_CONN_AES_CCM, &conn->flags); } else { - conn->link_mode &= ~HCI_LM_ENCRYPT; + clear_bit(HCI_CONN_ENCRYPT, &conn->flags); clear_bit(HCI_CONN_AES_CCM, &conn->flags); } } + /* We should disregard the current RPA and generate a new one + * whenever the encryption procedure fails. + */ + if (ev->status && conn->type == LE_LINK) + set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); if (ev->status && conn->state == BT_CONNECTED) { @@ -2384,7 +2490,7 @@ static void hci_change_link_key_complete_evt(struct hci_dev *hdev, conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { if (!ev->status) - conn->link_mode |= HCI_LM_SECURE; + set_bit(HCI_CONN_SECURE, &conn->flags); clear_bit(HCI_CONN_AUTH_PEND, &conn->flags); @@ -2595,6 +2701,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_cc_read_local_amp_info(hdev, skb); break; + case HCI_OP_READ_CLOCK: + hci_cc_read_clock(hdev, skb); + break; + case HCI_OP_READ_LOCAL_AMP_ASSOC: hci_cc_read_local_amp_assoc(hdev, skb); break; @@ -2709,7 +2819,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } if (opcode != HCI_OP_NOP) - del_timer(&hdev->cmd_timer); + cancel_delayed_work(&hdev->cmd_timer); hci_req_cmd_complete(hdev, opcode, status); @@ -2800,7 +2910,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) } if (opcode != HCI_OP_NOP) - del_timer(&hdev->cmd_timer); + cancel_delayed_work(&hdev->cmd_timer); if (ev->status || 
(hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event)) @@ -2824,12 +2934,8 @@ static void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); if (conn) { - if (!ev->status) { - if (ev->role) - conn->link_mode &= ~HCI_LM_MASTER; - else - conn->link_mode |= HCI_LM_MASTER; - } + if (!ev->status) + conn->role = ev->role; clear_bit(HCI_CONN_RSWITCH_PEND, &conn->flags); @@ -3023,10 +3129,11 @@ static void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_drop(conn); } - if (!test_bit(HCI_PAIRABLE, &hdev->dev_flags)) + if (!test_bit(HCI_BONDABLE, &hdev->dev_flags) && + !test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags)) { hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(ev->bdaddr), &ev->bdaddr); - else if (test_bit(HCI_MGMT, &hdev->dev_flags)) { + } else if (test_bit(HCI_MGMT, &hdev->dev_flags)) { u8 secure; if (conn->pending_sec_level == BT_SECURITY_HIGH) @@ -3065,12 +3172,6 @@ static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s found key type %u for %pMR", hdev->name, key->type, &ev->bdaddr); - if (!test_bit(HCI_DEBUG_KEYS, &hdev->dev_flags) && - key->type == HCI_LK_DEBUG_COMBINATION) { - BT_DBG("%s ignoring debug key", hdev->name); - goto not_found; - } - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); if (conn) { if ((key->type == HCI_LK_UNAUTH_COMBINATION_P192 || @@ -3110,6 +3211,8 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_link_key_notify *ev = (void *) skb->data; struct hci_conn *conn; + struct link_key *key; + bool persistent; u8 pin_len = 0; BT_DBG("%s", hdev->name); @@ -3128,10 +3231,33 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_conn_drop(conn); } - if (test_bit(HCI_MGMT, &hdev->dev_flags)) - hci_add_link_key(hdev, conn, 1, &ev->bdaddr, ev->link_key, - ev->key_type, pin_len); + if (!test_bit(HCI_MGMT, 
&hdev->dev_flags)) + goto unlock; + + key = hci_add_link_key(hdev, conn, &ev->bdaddr, ev->link_key, + ev->key_type, pin_len, &persistent); + if (!key) + goto unlock; + + mgmt_new_link_key(hdev, key, persistent); + /* Keep debug keys around only if the HCI_KEEP_DEBUG_KEYS flag + * is set. If it's not set simply remove the key from the kernel + * list (we've still notified user space about it but with + * store_hint being 0). + */ + if (key->type == HCI_LK_DEBUG_COMBINATION && + !test_bit(HCI_KEEP_DEBUG_KEYS, &hdev->dev_flags)) { + list_del(&key->list); + kfree(key); + } else if (conn) { + if (persistent) + clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags); + else + set_bit(HCI_CONN_FLUSH_KEY, &conn->flags); + } + +unlock: hci_dev_unlock(hdev); } @@ -3197,7 +3323,6 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, { struct inquiry_data data; int num_rsp = *((__u8 *) skb->data); - bool name_known, ssp; BT_DBG("%s num_rsp %d", hdev->name, num_rsp); @@ -3214,6 +3339,8 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, info = (void *) (skb->data + 1); for (; num_rsp; num_rsp--, info++) { + u32 flags; + bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; data.pscan_period_mode = info->pscan_period_mode; @@ -3223,16 +3350,18 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, data.rssi = info->rssi; data.ssp_mode = 0x00; - name_known = hci_inquiry_cache_update(hdev, &data, - false, &ssp); + flags = hci_inquiry_cache_update(hdev, &data, false); + mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - !name_known, ssp, NULL, 0, NULL, 0); + flags, NULL, 0, NULL, 0); } } else { struct inquiry_info_with_rssi *info = (void *) (skb->data + 1); for (; num_rsp; num_rsp--, info++) { + u32 flags; + bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; data.pscan_period_mode = info->pscan_period_mode; @@ -3241,11 +3370,12 @@ static void 
hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, data.clock_offset = info->clock_offset; data.rssi = info->rssi; data.ssp_mode = 0x00; - name_known = hci_inquiry_cache_update(hdev, &data, - false, &ssp); + + flags = hci_inquiry_cache_update(hdev, &data, false); + mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - !name_known, ssp, NULL, 0, NULL, 0); + flags, NULL, 0, NULL, 0); } } @@ -3348,6 +3478,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, hci_conn_add_sysfs(conn); break; + case 0x10: /* Connection Accept Timeout */ case 0x0d: /* Connection Rejected due to Limited Resources */ case 0x11: /* Unsupported Feature or Parameter Value */ case 0x1c: /* SCO interval rejected */ @@ -3411,7 +3542,8 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, hci_dev_lock(hdev); for (; num_rsp; num_rsp--, info++) { - bool name_known, ssp; + u32 flags; + bool name_known; bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; @@ -3429,12 +3561,13 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, else name_known = true; - name_known = hci_inquiry_cache_update(hdev, &data, name_known, - &ssp); + flags = hci_inquiry_cache_update(hdev, &data, name_known); + eir_len = eir_get_length(info->data, sizeof(info->data)); + mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, - info->dev_class, info->rssi, !name_known, - ssp, info->data, eir_len, NULL, 0); + info->dev_class, info->rssi, + flags, info->data, eir_len, NULL, 0); } hci_dev_unlock(hdev); @@ -3526,7 +3659,11 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!test_bit(HCI_MGMT, &hdev->dev_flags)) goto unlock; - if (test_bit(HCI_PAIRABLE, &hdev->dev_flags) || + /* Allow pairing if we're pairable, the initiators of the + * pairing or if the remote is not requesting bonding. 
+ */ + if (test_bit(HCI_BONDABLE, &hdev->dev_flags) || + test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags) || (conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) { struct hci_cp_io_capability_reply cp; @@ -3538,23 +3675,24 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb) /* If we are initiators, there is no remote information yet */ if (conn->remote_auth == 0xff) { - cp.authentication = conn->auth_type; - /* Request MITM protection if our IO caps allow it * except for the no-bonding case. - * conn->auth_type is not updated here since - * that might cause the user confirmation to be - * rejected in case the remote doesn't have the - * IO capabilities for MITM. */ if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && - cp.authentication != HCI_AT_NO_BONDING) - cp.authentication |= 0x01; + conn->auth_type != HCI_AT_NO_BONDING) + conn->auth_type |= 0x01; } else { conn->auth_type = hci_get_auth_req(conn); - cp.authentication = conn->auth_type; } + /* If we're not bondable, force one of the non-bondable + * authentication requirement values. + */ + if (!test_bit(HCI_BONDABLE, &hdev->dev_flags)) + conn->auth_type &= HCI_AT_NO_BONDING_MITM; + + cp.authentication = conn->auth_type; + if (hci_find_remote_oob_data(hdev, &conn->dst) && (conn->out || test_bit(HCI_CONN_REMOTE_OOB, &conn->flags))) cp.oob_data = 0x01; @@ -3621,9 +3759,12 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, rem_mitm = (conn->remote_auth & 0x01); /* If we require MITM but the remote device can't provide that - * (it has NoInputNoOutput) then reject the confirmation request + * (it has NoInputNoOutput) then reject the confirmation + * request. We check the security level here since it doesn't + * necessarily match conn->auth_type. 
*/ - if (loc_mitm && conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { + if (conn->pending_sec_level > BT_SECURITY_MEDIUM && + conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { BT_DBG("Rejecting request: remote device can't provide MITM"); hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY, sizeof(ev->bdaddr), &ev->bdaddr); @@ -3637,9 +3778,11 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, /* If we're not the initiators request authorization to * proceed from user space (mgmt_user_confirm with * confirm_hint set to 1). The exception is if neither - * side had MITM in which case we do auto-accept. + * side had MITM or if the local IO capability is + * NoInputNoOutput, in which case we do auto-accept */ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && + conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && (loc_mitm || rem_mitm)) { BT_DBG("Confirming auto-accept as acceptor"); confirm_hint = 1; @@ -3753,14 +3896,16 @@ static void hci_simple_pair_complete_evt(struct hci_dev *hdev, if (!conn) goto unlock; + /* Reset the authentication requirement to unknown */ + conn->remote_auth = 0xff; + /* To avoid duplicate auth_failed events to user space we check * the HCI_CONN_AUTH_PEND flag which will be set if we * initiated the authentication. 
A traditional auth_complete * event gets always produced as initiator and is also mapped to * the mgmt_auth_failed event */ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && ev->status) - mgmt_auth_failed(hdev, &conn->dst, conn->type, conn->dst_type, - ev->status); + mgmt_auth_failed(conn, ev->status); hci_conn_drop(conn); @@ -3967,16 +4112,23 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_conn_complete *ev = (void *) skb->data; + struct hci_conn_params *params; struct hci_conn *conn; struct smp_irk *irk; + u8 addr_type; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); hci_dev_lock(hdev); + /* All controllers implicitly stop advertising in the event of a + * connection, so ensure that the state bit is cleared. + */ + clear_bit(HCI_LE_ADV, &hdev->dev_flags); + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); if (!conn) { - conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr); + conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); if (!conn) { BT_ERR("No memory for new connection"); goto unlock; @@ -3984,11 +4136,6 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->dst_type = ev->bdaddr_type; - if (ev->role == LE_CONN_ROLE_MASTER) { - conn->out = true; - conn->link_mode |= HCI_LM_MASTER; - } - /* If we didn't have a hci_conn object previously * but we're in master role this must be something * initiated using a white list. Since white list based @@ -4025,6 +4172,14 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->init_addr_type = ev->bdaddr_type; bacpy(&conn->init_addr, &ev->bdaddr); + + /* For incoming connections, set the default minimum + * and maximum connection interval. They will be used + * to check if the parameters are in range and if not + * trigger the connection update procedure. 
+ */ + conn->le_conn_min_interval = hdev->le_conn_min_interval; + conn->le_conn_max_interval = hdev->le_conn_max_interval; } /* Lookup the identity address from the stored connection @@ -4047,6 +4202,17 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) goto unlock; } + if (conn->dst_type == ADDR_LE_DEV_PUBLIC) + addr_type = BDADDR_LE_PUBLIC; + else + addr_type = BDADDR_LE_RANDOM; + + /* Drop the connection if the device is blocked */ + if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) { + hci_conn_drop(conn); + goto unlock; + } + if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) mgmt_device_connected(hdev, &conn->dst, conn->type, conn->dst_type, 0, NULL, 0, NULL); @@ -4055,42 +4221,115 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; - if (test_bit(HCI_6LOWPAN_ENABLED, &hdev->dev_flags)) - set_bit(HCI_CONN_6LOWPAN, &conn->flags); + conn->le_conn_interval = le16_to_cpu(ev->interval); + conn->le_conn_latency = le16_to_cpu(ev->latency); + conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout); hci_conn_add_sysfs(conn); hci_proto_connect_cfm(conn, ev->status); - hci_pend_le_conn_del(hdev, &conn->dst, conn->dst_type); + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, + conn->dst_type); + if (params) { + list_del_init(&params->action); + if (params->conn) { + hci_conn_drop(params->conn); + hci_conn_put(params->conn); + params->conn = NULL; + } + } unlock: + hci_update_background_scan(hdev); + hci_dev_unlock(hdev); +} + +static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_conn_update_complete *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + if (ev->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); +
if (conn) { + conn->le_conn_interval = le16_to_cpu(ev->interval); + conn->le_conn_latency = le16_to_cpu(ev->latency); + conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout); + } + hci_dev_unlock(hdev); } /* This function requires the caller holds hdev->lock */ static void check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type) + u8 addr_type, u8 adv_type) { struct hci_conn *conn; - struct smp_irk *irk; + struct hci_conn_params *params; + + /* If the event is not connectable don't proceed further */ + if (adv_type != LE_ADV_IND && adv_type != LE_ADV_DIRECT_IND) + return; + + /* Ignore if the device is blocked */ + if (hci_bdaddr_list_lookup(&hdev->blacklist, addr, addr_type)) + return; - /* If this is a resolvable address, we should resolve it and then - * update address and address type variables. + /* Most controller will fail if we try to create new connections + * while we have an existing one in slave role. */ - irk = hci_get_irk(hdev, addr, addr_type); - if (irk) { - addr = &irk->bdaddr; - addr_type = irk->addr_type; - } + if (hdev->conn_hash.le_num_slave > 0) + return; - if (!hci_pend_le_conn_lookup(hdev, addr, addr_type)) + /* If we're not connectable only connect devices that we have in + * our pend_le_conns list. + */ + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, + addr, addr_type); + if (!params) return; + switch (params->auto_connect) { + case HCI_AUTO_CONN_DIRECT: + /* Only devices advertising with ADV_DIRECT_IND are + * triggering a connection attempt. This is allowing + * incoming connections from slave devices. + */ + if (adv_type != LE_ADV_DIRECT_IND) + return; + break; + case HCI_AUTO_CONN_ALWAYS: + /* Devices advertising with ADV_IND or ADV_DIRECT_IND + * are triggering a connection attempt. This means + * that incoming connectioms from slave device are + * accepted and also outgoing connections to slave + * devices are established when found. 
+ */ + break; + default: + return; + } + conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - HCI_AT_NO_BONDING); - if (!IS_ERR(conn)) + HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); + if (!IS_ERR(conn)) { + /* Store the pointer since we don't really have any + * other owner of the object besides the params that + * triggered it. This way we can abort the connection if + * the parameters get removed and keep the reference + * count consistent once the connection is established. + */ + params->conn = hci_conn_get(conn); return; + } switch (PTR_ERR(conn)) { case -EBUSY: @@ -4109,15 +4348,62 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, u8 bdaddr_type, s8 rssi, u8 *data, u8 len) { struct discovery_state *d = &hdev->discovery; + struct smp_irk *irk; bool match; + u32 flags; + + /* Check if we need to convert to identity address */ + irk = hci_get_irk(hdev, bdaddr, bdaddr_type); + if (irk) { + bdaddr = &irk->bdaddr; + bdaddr_type = irk->addr_type; + } - /* Passive scanning shouldn't trigger any device found events */ + /* Check if we have been requested to connect to this device */ + check_pending_le_conn(hdev, bdaddr, bdaddr_type, type); + + /* Passive scanning shouldn't trigger any device found events, + * except for devices marked as CONN_REPORT for which we do send + * device found events. 
+ */ if (hdev->le_scan_type == LE_SCAN_PASSIVE) { - if (type == LE_ADV_IND || type == LE_ADV_DIRECT_IND) - check_pending_le_conn(hdev, bdaddr, bdaddr_type); + if (type == LE_ADV_DIRECT_IND) + return; + + if (!hci_pend_le_action_lookup(&hdev->pend_le_reports, + bdaddr, bdaddr_type)) + return; + + if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND) + flags = MGMT_DEV_FOUND_NOT_CONNECTABLE; + else + flags = 0; + mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, + rssi, flags, data, len, NULL, 0); return; } + /* When receiving non-connectable or scannable undirected + * advertising reports, this means that the remote device is + * not connectable and then clearly indicate this in the + * device found event. + * + * When receiving a scan response, then there is no way to + * know if the remote device is connectable or not. However + * since scan responses are merged with a previously seen + * advertising report, the flags field from that report + * will be used. + * + * In the really unlikely case that a controller get confused + * and just sends a scan response event, then it is marked as + * not connectable as well. + */ + if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND || + type == LE_ADV_SCAN_RSP) + flags = MGMT_DEV_FOUND_NOT_CONNECTABLE; + else + flags = 0; + /* If there's nothing pending either store the data from this * event or send an immediate device found event if the data * should not be stored for later. 
@@ -4128,12 +4414,12 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) { store_pending_adv_report(hdev, bdaddr, bdaddr_type, - rssi, data, len); + rssi, flags, data, len); return; } mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, - rssi, 0, 1, data, len, NULL, 0); + rssi, flags, data, len, NULL, 0); return; } @@ -4150,7 +4436,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, if (!match) mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, d->last_adv_addr_type, NULL, - d->last_adv_rssi, 0, 1, + d->last_adv_rssi, d->last_adv_flags, d->last_adv_data, d->last_adv_data_len, NULL, 0); @@ -4159,7 +4445,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) { store_pending_adv_report(hdev, bdaddr, bdaddr_type, - rssi, data, len); + rssi, flags, data, len); return; } @@ -4168,7 +4454,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ clear_pending_adv_report(hdev); mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, - rssi, 0, 1, data, len, NULL, 0); + rssi, flags, data, len, NULL, 0); return; } @@ -4177,8 +4463,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * sending a merged device found event. 
*/ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, - d->last_adv_addr_type, NULL, rssi, 0, 1, data, len, - d->last_adv_data, d->last_adv_data_len); + d->last_adv_addr_type, NULL, rssi, d->last_adv_flags, + d->last_adv_data, d->last_adv_data_len, data, len); clear_pending_adv_report(hdev); } @@ -4219,17 +4505,14 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn == NULL) goto not_found; - ltk = hci_find_ltk(hdev, ev->ediv, ev->rand, conn->out); + ltk = hci_find_ltk(hdev, ev->ediv, ev->rand, conn->role); if (ltk == NULL) goto not_found; memcpy(cp.ltk, ltk->val, sizeof(ltk->val)); cp.handle = cpu_to_le16(conn->handle); - if (ltk->authenticated) - conn->pending_sec_level = BT_SECURITY_HIGH; - else - conn->pending_sec_level = BT_SECURITY_MEDIUM; + conn->pending_sec_level = smp_ltk_sec_level(ltk); conn->enc_key_size = ltk->enc_size; @@ -4241,9 +4524,12 @@ static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb) * distribute the keys. Later, security can be re-established * using a distributed LTK. 
*/ - if (ltk->type == HCI_SMP_STK_SLAVE) { + if (ltk->type == SMP_STK) { + set_bit(HCI_CONN_STK_ENCRYPT, &conn->flags); list_del(&ltk->list); kfree(ltk); + } else { + clear_bit(HCI_CONN_STK_ENCRYPT, &conn->flags); } hci_dev_unlock(hdev); @@ -4256,6 +4542,76 @@ not_found: hci_dev_unlock(hdev); } +static void send_conn_param_neg_reply(struct hci_dev *hdev, u16 handle, + u8 reason) +{ + struct hci_cp_le_conn_param_req_neg_reply cp; + + cp.handle = cpu_to_le16(handle); + cp.reason = reason; + + hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_NEG_REPLY, sizeof(cp), + &cp); +} + +static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_remote_conn_param_req *ev = (void *) skb->data; + struct hci_cp_le_conn_param_req_reply cp; + struct hci_conn *hcon; + u16 handle, min, max, latency, timeout; + + handle = le16_to_cpu(ev->handle); + min = le16_to_cpu(ev->interval_min); + max = le16_to_cpu(ev->interval_max); + latency = le16_to_cpu(ev->latency); + timeout = le16_to_cpu(ev->timeout); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon || hcon->state != BT_CONNECTED) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_UNKNOWN_CONN_ID); + + if (hci_check_conn_params(min, max, latency, timeout)) + return send_conn_param_neg_reply(hdev, handle, + HCI_ERROR_INVALID_LL_PARAMS); + + if (hcon->role == HCI_ROLE_MASTER) { + struct hci_conn_params *params; + u8 store_hint; + + hci_dev_lock(hdev); + + params = hci_conn_params_lookup(hdev, &hcon->dst, + hcon->dst_type); + if (params) { + params->conn_min_interval = min; + params->conn_max_interval = max; + params->conn_latency = latency; + params->supervision_timeout = timeout; + store_hint = 0x01; + } else{ + store_hint = 0x00; + } + + hci_dev_unlock(hdev); + + mgmt_new_conn_param(hdev, &hcon->dst, hcon->dst_type, + store_hint, min, max, latency, timeout); + } + + cp.handle = ev->handle; + cp.interval_min = ev->interval_min; + cp.interval_max = ev->interval_max; +
cp.latency = ev->latency; + cp.timeout = ev->timeout; + cp.min_ce_len = 0; + cp.max_ce_len = 0; + + hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_REPLY, sizeof(cp), &cp); +} + static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_le_meta *le_ev = (void *) skb->data; @@ -4267,6 +4623,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_conn_complete_evt(hdev, skb); break; + case HCI_EV_LE_CONN_UPDATE_COMPLETE: + hci_le_conn_update_complete_evt(hdev, skb); + break; + case HCI_EV_LE_ADVERTISING_REPORT: hci_le_adv_report_evt(hdev, skb); break; @@ -4275,6 +4635,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_ltk_request_evt(hdev, skb); break; + case HCI_EV_LE_REMOTE_CONN_PARAM_REQ: + hci_le_remote_conn_param_req_evt(hdev, skb); + break; + default: break; } @@ -4306,7 +4670,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) /* Received events are (currently) only needed when a request is * ongoing so avoid unnecessary memory allocation. 
*/ - if (hdev->req_status == HCI_REQ_PEND) { + if (hci_req_pending(hdev)) { kfree_skb(hdev->recv_evt); hdev->recv_evt = skb_clone(skb, GFP_KERNEL); } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 80d25c150a65..115f149362ba 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -35,13 +35,32 @@ static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ +/* Socket info */ +#define hci_pi(sk) ((struct hci_pinfo *) sk) + +struct hci_pinfo { + struct bt_sock bt; + struct hci_dev *hdev; + struct hci_filter filter; + __u32 cmsg_mask; + unsigned short channel; +}; + static inline int hci_test_bit(int nr, void *addr) { return *((__u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); } /* Security filter */ -static struct hci_sec_filter hci_sec_filter = { +#define HCI_SFLT_MAX_OGF 5 + +struct hci_sec_filter { + __u32 type_mask; + __u32 event_mask[2]; + __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4]; +}; + +static const struct hci_sec_filter hci_sec_filter = { /* Packet types */ 0x10, /* Events */ @@ -481,7 +500,7 @@ static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_blacklist_add(hdev, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_add(&hdev->blacklist, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); @@ -498,7 +517,7 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg) hci_dev_lock(hdev); - err = hci_blacklist_del(hdev, &bdaddr, BDADDR_BREDR); + err = hci_bdaddr_list_del(&hdev->blacklist, &bdaddr, BDADDR_BREDR); hci_dev_unlock(hdev); @@ -517,6 +536,9 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) return -EBUSY; + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + return -EOPNOTSUPP; + if (hdev->dev_type != HCI_BREDR) return -EOPNOTSUPP; @@ -690,7 +712,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (test_bit(HCI_UP, &hdev->flags) || 
test_bit(HCI_INIT, &hdev->flags) || - test_bit(HCI_SETUP, &hdev->dev_flags)) { + test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags)) { err = -EBUSY; hci_dev_put(hdev); goto done; @@ -960,7 +983,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, goto drop; } - if (test_bit(HCI_RAW, &hdev->flags) || (ogf == 0x3f)) { + if (ogf == 0x3f) { skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } else { diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 8181ea4bc2f2..1b7d605706aa 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -154,7 +154,7 @@ static int hidp_input_event(struct input_dev *dev, unsigned int type, (!!test_bit(LED_COMPOSE, dev->led) << 3) | (!!test_bit(LED_SCROLLL, dev->led) << 2) | (!!test_bit(LED_CAPSL, dev->led) << 1) | - (!!test_bit(LED_NUML, dev->led)); + (!!test_bit(LED_NUML, dev->led) << 0); if (session->leds == newleds) return 0; @@ -915,7 +915,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, /* connection management */ bacpy(&session->bdaddr, bdaddr); - session->conn = conn; + session->conn = l2cap_conn_get(conn); session->user.probe = hidp_session_probe; session->user.remove = hidp_session_remove; session->ctrl_sock = ctrl_sock; @@ -941,13 +941,13 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr, if (ret) goto err_free; - l2cap_conn_get(session->conn); get_file(session->intr_sock->file); get_file(session->ctrl_sock->file); *out = session; return 0; err_free: + l2cap_conn_put(session->conn); kfree(session); return ret; } @@ -1327,10 +1327,8 @@ int hidp_connection_add(struct hidp_connadd_req *req, conn = NULL; l2cap_chan_lock(chan); - if (chan->conn) { - l2cap_conn_get(chan->conn); - conn = chan->conn; - } + if (chan->conn) + conn = l2cap_conn_get(chan->conn); l2cap_chan_unlock(chan); if (!conn) diff --git a/net/bluetooth/l2cap_core.c 
b/net/bluetooth/l2cap_core.c index 323f23cd2c37..b6f9777e057d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -40,14 +40,13 @@ #include "smp.h" #include "a2mp.h" #include "amp.h" -#include "6lowpan.h" #define LE_FLOWCTL_MAX_CREDITS 65535 bool disable_ertm; static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; -static u8 l2cap_fixed_chan[8] = { L2CAP_FC_L2CAP | L2CAP_FC_CONNLESS, }; +static u8 l2cap_fixed_chan[8] = { L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS, }; static LIST_HEAD(chan_list); static DEFINE_RWLOCK(chan_list_lock); @@ -205,11 +204,16 @@ done: write_unlock(&chan_list_lock); return err; } +EXPORT_SYMBOL_GPL(l2cap_add_psm); int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid) { write_lock(&chan_list_lock); + /* Override the defaults (which are for conn-oriented) */ + chan->omtu = L2CAP_DEFAULT_MTU; + chan->chan_type = L2CAP_CHAN_FIXED; + chan->scid = scid; write_unlock(&chan_list_lock); @@ -437,6 +441,7 @@ struct l2cap_chan *l2cap_chan_create(void) return chan; } +EXPORT_SYMBOL_GPL(l2cap_chan_create); static void l2cap_chan_destroy(struct kref *kref) { @@ -464,6 +469,7 @@ void l2cap_chan_put(struct l2cap_chan *c) kref_put(&c->kref, l2cap_chan_destroy); } +EXPORT_SYMBOL_GPL(l2cap_chan_put); void l2cap_chan_set_defaults(struct l2cap_chan *chan) { @@ -482,6 +488,7 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); } +EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults); static void l2cap_le_flowctl_init(struct l2cap_chan *chan) { @@ -539,7 +546,10 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) l2cap_chan_hold(chan); - hci_conn_hold(conn->hcon); + /* Only keep a reference for fixed channels if they requested it */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) + hci_conn_hold(conn->hcon); list_add(&chan->list, &conn->chan_l); } @@ -559,6 +569,8 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) 
BT_DBG("chan %p, conn %p, err %d", chan, conn, err); + chan->ops->teardown(chan, err); + if (conn) { struct amp_mgr *mgr = conn->hcon->amp_mgr; /* Delete from channel list */ @@ -568,7 +580,12 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) chan->conn = NULL; - if (chan->scid != L2CAP_CID_A2MP) + /* Reference was only held for non-fixed channels or + * fixed channels that explicitly requested it using the + * FLAG_HOLD_HCI_CONN flag. + */ + if (chan->chan_type != L2CAP_CHAN_FIXED || + test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_drop(conn->hcon); if (mgr && mgr->bredr_chan == chan) @@ -582,8 +599,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) amp_disconnect_logical_link(hs_hchan); } - chan->ops->teardown(chan, err); - if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state)) return; @@ -614,10 +629,13 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) return; } +EXPORT_SYMBOL_GPL(l2cap_chan_del); -void l2cap_conn_update_id_addr(struct hci_conn *hcon) +static void l2cap_conn_update_id_addr(struct work_struct *work) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn = container_of(work, struct l2cap_conn, + id_addr_update_work); + struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; mutex_lock(&conn->chan_lock); @@ -717,6 +735,7 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) break; } } +EXPORT_SYMBOL(l2cap_chan_close); static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) { @@ -770,7 +789,7 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan) } /* Service level security */ -int l2cap_chan_check_security(struct l2cap_chan *chan) +int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator) { struct l2cap_conn *conn = chan->conn; __u8 auth_type; @@ -780,7 +799,8 @@ int l2cap_chan_check_security(struct l2cap_chan *chan) auth_type = l2cap_get_auth_type(chan); - return hci_conn_security(conn->hcon, chan->sec_level, auth_type); + return hci_conn_security(conn->hcon, 
chan->sec_level, auth_type, + initiator); } static u8 l2cap_get_ident(struct l2cap_conn *conn) @@ -793,14 +813,14 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) * 200 - 254 are used by utilities like l2ping, etc. */ - spin_lock(&conn->lock); + mutex_lock(&conn->ident_lock); if (++conn->tx_ident > 128) conn->tx_ident = 1; id = conn->tx_ident; - spin_unlock(&conn->lock); + mutex_unlock(&conn->ident_lock); return id; } @@ -1076,6 +1096,9 @@ static void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, bool poll) static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan) { + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) + return true; + return !test_bit(CONF_CONNECT_PEND, &chan->conf_state); } @@ -1260,6 +1283,24 @@ static void l2cap_start_connection(struct l2cap_chan *chan) } } +static void l2cap_request_info(struct l2cap_conn *conn) +{ + struct l2cap_info_req req; + + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) + return; + + req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); + + conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; + conn->info_ident = l2cap_get_ident(conn); + + schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + + l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, + sizeof(req), &req); +} + static void l2cap_do_start(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -1269,26 +1310,17 @@ static void l2cap_do_start(struct l2cap_chan *chan) return; } - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { - if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) - return; - - if (l2cap_chan_check_security(chan) && - __l2cap_no_conn_pending(chan)) { - l2cap_start_connection(chan); - } - } else { - struct l2cap_info_req req; - req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); - - conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; - conn->info_ident = l2cap_get_ident(conn); + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)) { + l2cap_request_info(conn); + return; + } - 
schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) + return; - l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, - sizeof(req), &req); - } + if (l2cap_chan_check_security(chan, true) && + __l2cap_no_conn_pending(chan)) + l2cap_start_connection(chan); } static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask) @@ -1347,12 +1379,13 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_lock(chan); if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + l2cap_chan_ready(chan); l2cap_chan_unlock(chan); continue; } if (chan->state == BT_CONNECT) { - if (!l2cap_chan_check_security(chan) || + if (!l2cap_chan_check_security(chan, true) || !__l2cap_no_conn_pending(chan)) { l2cap_chan_unlock(chan); continue; @@ -1374,7 +1407,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) rsp.scid = cpu_to_le16(chan->dcid); rsp.dcid = cpu_to_le16(chan->scid); - if (l2cap_chan_check_security(chan)) { + if (l2cap_chan_check_security(chan, false)) { if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { rsp.result = cpu_to_le16(L2CAP_CR_PEND); rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND); @@ -1411,88 +1444,37 @@ static void l2cap_conn_start(struct l2cap_conn *conn) mutex_unlock(&conn->chan_lock); } -/* Find socket with cid and source/destination bdaddr. - * Returns closest match, locked. - */ -static struct l2cap_chan *l2cap_global_chan_by_scid(int state, u16 cid, - bdaddr_t *src, - bdaddr_t *dst) -{ - struct l2cap_chan *c, *c1 = NULL; - - read_lock(&chan_list_lock); - - list_for_each_entry(c, &chan_list, global_l) { - if (state && c->state != state) - continue; - - if (c->scid == cid) { - int src_match, dst_match; - int src_any, dst_any; - - /* Exact match. 
*/ - src_match = !bacmp(&c->src, src); - dst_match = !bacmp(&c->dst, dst); - if (src_match && dst_match) { - read_unlock(&chan_list_lock); - return c; - } - - /* Closest match */ - src_any = !bacmp(&c->src, BDADDR_ANY); - dst_any = !bacmp(&c->dst, BDADDR_ANY); - if ((src_match && dst_any) || (src_any && dst_match) || - (src_any && dst_any)) - c1 = c; - } - } - - read_unlock(&chan_list_lock); - - return c1; -} - static void l2cap_le_conn_ready(struct l2cap_conn *conn) { struct hci_conn *hcon = conn->hcon; - struct l2cap_chan *chan, *pchan; - u8 dst_type; - - BT_DBG(""); - - bt_6lowpan_add_conn(conn); - - /* Check if we have socket listening on cid */ - pchan = l2cap_global_chan_by_scid(BT_LISTEN, L2CAP_CID_ATT, - &hcon->src, &hcon->dst); - if (!pchan) - return; - - /* Client ATT sockets should override the server one */ - if (__l2cap_get_chan_by_dcid(conn, L2CAP_CID_ATT)) - return; - - dst_type = bdaddr_type(hcon, hcon->dst_type); - - /* If device is blocked, do not create a channel for it */ - if (hci_blacklist_lookup(hcon->hdev, &hcon->dst, dst_type)) - return; + struct hci_dev *hdev = hcon->hdev; - l2cap_chan_lock(pchan); + BT_DBG("%s conn %p", hdev->name, conn); - chan = pchan->ops->new_connection(pchan); - if (!chan) - goto clean; + /* For outgoing pairing which doesn't necessarily have an + * associated socket (e.g. mgmt_pair_device). + */ + if (hcon->out) + smp_conn_security(hcon, hcon->pending_sec_level); - bacpy(&chan->src, &hcon->src); - bacpy(&chan->dst, &hcon->dst); - chan->src_type = bdaddr_type(hcon, hcon->src_type); - chan->dst_type = dst_type; + /* For LE slave connections, make sure the connection interval + * is in the range of the minium and maximum interval that has + * been configured for this connection. If not, then trigger + * the connection update procedure. 
+ */ + if (hcon->role == HCI_ROLE_SLAVE && + (hcon->le_conn_interval < hcon->le_conn_min_interval || + hcon->le_conn_interval > hcon->le_conn_max_interval)) { + struct l2cap_conn_param_update_req req; - __l2cap_chan_add(conn, chan); + req.min = cpu_to_le16(hcon->le_conn_min_interval); + req.max = cpu_to_le16(hcon->le_conn_max_interval); + req.latency = cpu_to_le16(hcon->le_conn_latency); + req.to_multiplier = cpu_to_le16(hcon->le_supv_timeout); -clean: - l2cap_chan_unlock(pchan); + l2cap_send_cmd(conn, l2cap_get_ident(conn), + L2CAP_CONN_PARAM_UPDATE_REQ, sizeof(req), &req); + } } static void l2cap_conn_ready(struct l2cap_conn *conn) @@ -1502,17 +1484,11 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - /* For outgoing pairing which doesn't necessarily have an - * associated socket (e.g. mgmt_pair_device). - */ - if (hcon->out && hcon->type == LE_LINK) - smp_conn_security(hcon, hcon->pending_sec_level); + if (hcon->type == ACL_LINK) + l2cap_request_info(conn); mutex_lock(&conn->chan_lock); - if (hcon->type == LE_LINK) - l2cap_le_conn_ready(conn); - list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1525,8 +1501,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == LE_LINK) { l2cap_le_start(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { - l2cap_chan_ready(chan); - + if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) + l2cap_chan_ready(chan); } else if (chan->state == BT_CONNECT) { l2cap_do_start(chan); } @@ -1536,6 +1512,9 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) mutex_unlock(&conn->chan_lock); + if (hcon->type == LE_LINK) + l2cap_le_conn_ready(conn); + queue_work(hcon->hdev->workqueue, &conn->pending_rx_work); } @@ -1671,8 +1650,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + if (work_pending(&conn->id_addr_update_work)) + 
cancel_work_sync(&conn->id_addr_update_work); + l2cap_unregister_all_users(conn); + /* Force the connection to be immediately dropped */ + hcon->disc_timeout = 0; + mutex_lock(&conn->chan_lock); /* Kill channels */ @@ -1695,29 +1680,11 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); - smp_chan_destroy(conn); - } - hcon->l2cap_data = NULL; conn->hchan = NULL; l2cap_conn_put(conn); } -static void security_timeout(struct work_struct *work) -{ - struct l2cap_conn *conn = container_of(work, struct l2cap_conn, - security_timer.work); - - BT_DBG("conn %p", conn); - - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - smp_chan_destroy(conn); - l2cap_conn_del(conn->hcon, ETIMEDOUT); - } -} - static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); @@ -1726,9 +1693,10 @@ static void l2cap_conn_free(struct kref *ref) kfree(conn); } -void l2cap_conn_get(struct l2cap_conn *conn) +struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn) { kref_get(&conn->ref); + return conn; } EXPORT_SYMBOL(l2cap_conn_get); @@ -1770,6 +1738,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { + l2cap_chan_hold(c); read_unlock(&chan_list_lock); return c; } @@ -1783,6 +1752,9 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, } } + if (c1) + l2cap_chan_hold(c1); + read_unlock(&chan_list_lock); return c1; @@ -2003,10 +1975,12 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan) tx_skb->data + L2CAP_HDR_SIZE); } + /* Update FCS */ if (chan->fcs == L2CAP_FCS_CRC16) { - u16 fcs = crc16(0, (u8 *) tx_skb->data, tx_skb->len); - 
put_unaligned_le16(fcs, skb_put(tx_skb, - L2CAP_FCS_SIZE)); + u16 fcs = crc16(0, (u8 *) tx_skb->data, + tx_skb->len - L2CAP_FCS_SIZE); + put_unaligned_le16(fcs, skb_tail_pointer(tx_skb) - + L2CAP_FCS_SIZE); } l2cap_do_send(chan, tx_skb); @@ -2118,7 +2092,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, struct sk_buff **frag; int sent = 0; - if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) + if (chan->ops->memcpy_fromiovec(chan, skb_put(skb, count), + msg->msg_iov, count)) return -EFAULT; sent += count; @@ -2131,18 +2106,17 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, count = min_t(unsigned int, conn->mtu, len); - tmp = chan->ops->alloc_skb(chan, count, + tmp = chan->ops->alloc_skb(chan, 0, count, msg->msg_flags & MSG_DONTWAIT); if (IS_ERR(tmp)) return PTR_ERR(tmp); *frag = tmp; - if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) + if (chan->ops->memcpy_fromiovec(chan, skb_put(*frag, count), + msg->msg_iov, count)) return -EFAULT; - (*frag)->priority = skb->priority; - sent += count; len -= count; @@ -2156,26 +2130,23 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, } static struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan, - struct msghdr *msg, size_t len, - u32 priority) + struct msghdr *msg, size_t len) { struct l2cap_conn *conn = chan->conn; struct sk_buff *skb; int err, count, hlen = L2CAP_HDR_SIZE + L2CAP_PSMLEN_SIZE; struct l2cap_hdr *lh; - BT_DBG("chan %p psm 0x%2.2x len %zu priority %u", chan, - __le16_to_cpu(chan->psm), len, priority); + BT_DBG("chan %p psm 0x%2.2x len %zu", chan, + __le16_to_cpu(chan->psm), len); count = min_t(unsigned int, (conn->mtu - hlen), len); - skb = chan->ops->alloc_skb(chan, count + hlen, + skb = chan->ops->alloc_skb(chan, hlen, count, msg->msg_flags & MSG_DONTWAIT); if (IS_ERR(skb)) return skb; - skb->priority = priority; - /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); lh->cid = 
cpu_to_le16(chan->dcid); @@ -2191,8 +2162,7 @@ static struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan, } static struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan, - struct msghdr *msg, size_t len, - u32 priority) + struct msghdr *msg, size_t len) { struct l2cap_conn *conn = chan->conn; struct sk_buff *skb; @@ -2203,13 +2173,11 @@ static struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan, count = min_t(unsigned int, (conn->mtu - L2CAP_HDR_SIZE), len); - skb = chan->ops->alloc_skb(chan, count + L2CAP_HDR_SIZE, + skb = chan->ops->alloc_skb(chan, L2CAP_HDR_SIZE, count, msg->msg_flags & MSG_DONTWAIT); if (IS_ERR(skb)) return skb; - skb->priority = priority; - /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); lh->cid = cpu_to_le16(chan->dcid); @@ -2247,7 +2215,7 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan, count = min_t(unsigned int, (conn->mtu - hlen), len); - skb = chan->ops->alloc_skb(chan, count + hlen, + skb = chan->ops->alloc_skb(chan, hlen, count, msg->msg_flags & MSG_DONTWAIT); if (IS_ERR(skb)) return skb; @@ -2316,7 +2284,6 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, } else { sar = L2CAP_SAR_START; sdu_len = len; - pdu_len -= L2CAP_SDULEN_SIZE; } while (len > 0) { @@ -2331,10 +2298,8 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan, __skb_queue_tail(seg_queue, skb); len -= pdu_len; - if (sdu_len) { + if (sdu_len) sdu_len = 0; - pdu_len += L2CAP_SDULEN_SIZE; - } if (len <= pdu_len) { sar = L2CAP_SAR_END; @@ -2368,7 +2333,7 @@ static struct sk_buff *l2cap_create_le_flowctl_pdu(struct l2cap_chan *chan, count = min_t(unsigned int, (conn->mtu - hlen), len); - skb = chan->ops->alloc_skb(chan, count + hlen, + skb = chan->ops->alloc_skb(chan, hlen, count, msg->msg_flags & MSG_DONTWAIT); if (IS_ERR(skb)) return skb; @@ -2400,12 +2365,8 @@ static int l2cap_segment_le_sdu(struct l2cap_chan *chan, BT_DBG("chan %p, msg %p, len %zu", chan, msg, len); - 
pdu_len = chan->conn->mtu - L2CAP_HDR_SIZE; - - pdu_len = min_t(size_t, pdu_len, chan->remote_mps); - sdu_len = len; - pdu_len -= L2CAP_SDULEN_SIZE; + pdu_len = chan->remote_mps - L2CAP_SDULEN_SIZE; while (len > 0) { if (len <= pdu_len) @@ -2430,8 +2391,7 @@ static int l2cap_segment_le_sdu(struct l2cap_chan *chan, return 0; } -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, - u32 priority) +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) { struct sk_buff *skb; int err; @@ -2442,7 +2402,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, /* Connectionless channel */ if (chan->chan_type == L2CAP_CHAN_CONN_LESS) { - skb = l2cap_create_connless_pdu(chan, msg, len, priority); + skb = l2cap_create_connless_pdu(chan, msg, len); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -2499,7 +2459,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, return -EMSGSIZE; /* Create a basic PDU */ - skb = l2cap_create_basic_pdu(chan, msg, len, priority); + skb = l2cap_create_basic_pdu(chan, msg, len); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -2562,6 +2522,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, return err; } +EXPORT_SYMBOL_GPL(l2cap_chan_send); static void l2cap_send_srej(struct l2cap_chan *chan, u16 txseq) { @@ -3217,6 +3178,9 @@ done: switch (chan->mode) { case L2CAP_MODE_BASIC: + if (disable_ertm) + break; + if (!(chan->conn->feat_mask & L2CAP_FEAT_ERTM) && !(chan->conn->feat_mask & L2CAP_FEAT_STREAMING)) break; @@ -3829,7 +3793,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, chan->ident = cmd->ident; if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) { - if (l2cap_chan_check_security(chan)) { + if (l2cap_chan_check_security(chan, false)) { if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { l2cap_state_change(chan, BT_CONNECT2); result = L2CAP_CR_PEND; @@ -3863,6 +3827,7 @@ static struct l2cap_chan 
*l2cap_connect(struct l2cap_conn *conn, response: l2cap_chan_unlock(pchan); mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); sendresp: rsp.scid = cpu_to_le16(scid); @@ -5197,27 +5162,6 @@ static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn, return 0; } -static inline int l2cap_check_conn_param(u16 min, u16 max, u16 latency, - u16 to_multiplier) -{ - u16 max_latency; - - if (min > max || min < 6 || max > 3200) - return -EINVAL; - - if (to_multiplier < 10 || to_multiplier > 3200) - return -EINVAL; - - if (max >= to_multiplier * 8) - return -EINVAL; - - max_latency = (to_multiplier * 8 / max) - 1; - if (latency > 499 || latency > max_latency) - return -EINVAL; - - return 0; -} - static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) @@ -5228,7 +5172,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, u16 min, max, latency, to_multiplier; int err; - if (!(hcon->link_mode & HCI_LM_MASTER)) + if (hcon->role != HCI_ROLE_MASTER) return -EINVAL; if (cmd_len != sizeof(struct l2cap_conn_param_update_req)) @@ -5245,7 +5189,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - err = l2cap_check_conn_param(min, max, latency, to_multiplier); + err = hci_check_conn_params(min, max, latency, to_multiplier); if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else @@ -5254,8 +5198,16 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, sizeof(rsp), &rsp); - if (!err) - hci_le_conn_update(hcon, min, max, latency, to_multiplier); + if (!err) { + u8 store_hint; + + store_hint = hci_le_conn_update(hcon, min, max, latency, + to_multiplier); + mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type, + store_hint, min, max, latency, + to_multiplier); + + } return 0; } @@ -5479,6 +5431,11 @@ static int 
l2cap_le_connect_req(struct l2cap_conn *conn, if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { l2cap_state_change(chan, BT_CONNECT2); + /* The following result value is actually not defined + * for LE CoC but we use it to let the function know + * that it should bail out after doing its cleanup + * instead of sending a response. + */ result = L2CAP_CR_PEND; chan->ops->defer(chan); } else { @@ -5489,6 +5446,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) return 0; @@ -6837,12 +6795,12 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, struct l2cap_chan *chan; if (hcon->type != ACL_LINK) - goto drop; + goto free_skb; chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst, ACL_LINK); if (!chan) - goto drop; + goto free_skb; BT_DBG("chan %p, len %d", chan, skb->len); @@ -6856,39 +6814,14 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, bacpy(&bt_cb(skb)->bdaddr, &hcon->dst); bt_cb(skb)->psm = psm; - if (!chan->ops->recv(chan, skb)) - return; - -drop: - kfree_skb(skb); -} - -static void l2cap_att_channel(struct l2cap_conn *conn, - struct sk_buff *skb) -{ - struct hci_conn *hcon = conn->hcon; - struct l2cap_chan *chan; - - if (hcon->type != LE_LINK) - goto drop; - - chan = l2cap_global_chan_by_scid(BT_CONNECTED, L2CAP_CID_ATT, - &hcon->src, &hcon->dst); - if (!chan) - goto drop; - - BT_DBG("chan %p, len %d", chan, skb->len); - - if (hci_blacklist_lookup(hcon->hdev, &hcon->dst, hcon->dst_type)) - goto drop; - - if (chan->imtu < skb->len) - goto drop; - - if (!chan->ops->recv(chan, skb)) + if (!chan->ops->recv(chan, skb)) { + l2cap_chan_put(chan); return; + } drop: + l2cap_chan_put(chan); +free_skb: kfree_skb(skb); } @@ -6914,6 +6847,16 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) return; } + /* Since we can't actively block incoming LE 
connections we must + * at least ensure that we ignore incoming data from them. + */ + if (hcon->type == LE_LINK && + hci_bdaddr_list_lookup(&hcon->hdev->blacklist, &hcon->dst, + bdaddr_type(hcon, hcon->dst_type))) { + kfree_skb(skb); + return; + } + BT_DBG("len %d, cid 0x%4.4x", len, cid); switch (cid) { @@ -6927,23 +6870,10 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) l2cap_conless_channel(conn, psm, skb); break; - case L2CAP_CID_ATT: - l2cap_att_channel(conn, skb); - break; - case L2CAP_CID_LE_SIGNALING: l2cap_le_sig_channel(conn, skb); break; - case L2CAP_CID_SMP: - if (smp_sig_channel(conn, skb)) - l2cap_conn_del(conn->hcon, EACCES); - break; - - case L2CAP_FC_6LOWPAN: - bt_6lowpan_recv(conn, skb); - break; - default: l2cap_data_channel(conn, cid, skb); break; @@ -6974,7 +6904,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) if (!hchan) return NULL; - conn = kzalloc(sizeof(struct l2cap_conn), GFP_KERNEL); + conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) { hci_chan_del(hchan); return NULL; @@ -6982,8 +6912,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) kref_init(&conn->ref); hcon->l2cap_data = conn; - conn->hcon = hcon; - hci_conn_get(conn->hcon); + conn->hcon = hci_conn_get(hcon); conn->hchan = hchan; BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); @@ -7006,19 +6935,17 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->hs_enabled = test_bit(HCI_HS_ENABLED, &hcon->hdev->dev_flags); - spin_lock_init(&conn->lock); + mutex_init(&conn->ident_lock); mutex_init(&conn->chan_lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); - if (hcon->type == LE_LINK) - INIT_DELAYED_WORK(&conn->security_timer, security_timeout); - else - INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); + 
INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr); conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM; @@ -7042,7 +6969,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, struct l2cap_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; - __u8 auth_type; int err; BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst, @@ -7054,8 +6980,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, hci_dev_lock(hdev); - l2cap_chan_lock(chan); - if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid && chan->chan_type != L2CAP_CHAN_RAW) { err = -EINVAL; @@ -7084,7 +7008,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, break; /* fall through */ default: - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto done; } @@ -7118,9 +7042,9 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan->psm = psm; chan->dcid = cid; - auth_type = l2cap_get_auth_type(chan); - if (bdaddr_type_is_le(dst_type)) { + u8 role; + /* Convert from L2CAP channel address type to HCI address type */ if (dst_type == BDADDR_LE_PUBLIC) @@ -7128,9 +7052,15 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else dst_type = ADDR_LE_DEV_RANDOM; + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + role = HCI_ROLE_SLAVE; + else + role = HCI_ROLE_MASTER; + hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level, - auth_type); + HCI_LE_CONN_TIMEOUT, role); } else { + u8 auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); } @@ -7146,19 +7076,20 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, goto done; } + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(chan); + if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { hci_conn_drop(hcon); err = -EBUSY; - goto done; + goto chan_unlock; } /* Update source addr of the socket */ bacpy(&chan->src, &hcon->src); chan->src_type = bdaddr_type(hcon, hcon->src_type); - 
l2cap_chan_unlock(chan); - l2cap_chan_add(conn, chan); - l2cap_chan_lock(chan); + __l2cap_chan_add(conn, chan); /* l2cap_chan_add takes its own ref so we can drop this one */ hci_conn_drop(hcon); @@ -7176,7 +7107,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, if (hcon->state == BT_CONNECTED) { if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { __clear_chan_timer(chan); - if (l2cap_chan_check_security(chan)) + if (l2cap_chan_check_security(chan, true)) l2cap_state_change(chan, BT_CONNECTED); } else l2cap_do_start(chan); @@ -7184,12 +7115,15 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, err = 0; -done: +chan_unlock: l2cap_chan_unlock(chan); + mutex_unlock(&conn->chan_lock); +done: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } +EXPORT_SYMBOL_GPL(l2cap_chan_connect); /* ---- L2CAP interface with lower layer (HCI) ---- */ @@ -7222,19 +7156,99 @@ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr) return exact ? lm1 : lm2; } +/* Find the next fixed channel in BT_LISTEN state, continue iteration + * from an existing channel in the list or from the beginning of the + * global list (by passing NULL as first parameter). 
+ */ +static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, + bdaddr_t *src, u8 link_type) +{ + read_lock(&chan_list_lock); + + if (c) + c = list_next_entry(c, global_l); + else + c = list_entry(chan_list.next, typeof(*c), global_l); + + list_for_each_entry_from(c, &chan_list, global_l) { + if (c->chan_type != L2CAP_CHAN_FIXED) + continue; + if (c->state != BT_LISTEN) + continue; + if (bacmp(&c->src, src) && bacmp(&c->src, BDADDR_ANY)) + continue; + if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR) + continue; + if (link_type == LE_LINK && c->src_type == BDADDR_BREDR) + continue; + + l2cap_chan_hold(c); + read_unlock(&chan_list_lock); + return c; + } + + read_unlock(&chan_list_lock); + + return NULL; +} + void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { + struct hci_dev *hdev = hcon->hdev; struct l2cap_conn *conn; + struct l2cap_chan *pchan; + u8 dst_type; BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); - if (!status) { - conn = l2cap_conn_add(hcon); - if (conn) - l2cap_conn_ready(conn); - } else { + if (status) { l2cap_conn_del(hcon, bt_to_errno(status)); + return; } + + conn = l2cap_conn_add(hcon); + if (!conn) + return; + + dst_type = bdaddr_type(hcon, hcon->dst_type); + + /* If device is blocked, do not create channels for it */ + if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type)) + return; + + /* Find fixed channels and notify them of the new connection. We + * use multiple individual lookups, continuing each time where + * we left off, because the list lock would prevent calling the + * potentially sleeping l2cap_chan_lock() function. 
+ */ + pchan = l2cap_global_fixed_chan(NULL, &hdev->bdaddr, hcon->type); + while (pchan) { + struct l2cap_chan *chan, *next; + + /* Client fixed channels should override server ones */ + if (__l2cap_get_chan_by_dcid(conn, pchan->scid)) + goto next; + + l2cap_chan_lock(pchan); + chan = pchan->ops->new_connection(pchan); + if (chan) { + bacpy(&chan->src, &hcon->src); + bacpy(&chan->dst, &hcon->dst); + chan->src_type = bdaddr_type(hcon, hcon->src_type); + chan->dst_type = dst_type; + + __l2cap_chan_add(conn, chan); + } + + l2cap_chan_unlock(pchan); +next: + next = l2cap_global_fixed_chan(pchan, &hdev->bdaddr, + hcon->type); + l2cap_chan_put(pchan); + pchan = next; + } + + l2cap_conn_ready(conn); } int l2cap_disconn_ind(struct hci_conn *hcon) @@ -7252,8 +7266,6 @@ void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { BT_DBG("hcon %p reason %d", hcon, reason); - bt_6lowpan_del_conn(hcon->l2cap_data); - l2cap_conn_del(hcon, bt_to_errno(reason)); } @@ -7284,12 +7296,6 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - if (hcon->type == LE_LINK) { - if (!status && encrypt) - smp_distribute_keys(conn); - cancel_delayed_work(&conn->security_timer); - } - mutex_lock(&conn->chan_lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -7303,15 +7309,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) continue; } - if (chan->scid == L2CAP_CID_ATT) { - if (!status && encrypt) { - chan->sec_level = hcon->sec_level; - l2cap_chan_ready(chan); - } - - l2cap_chan_unlock(chan); - continue; - } + if (!status && encrypt) + chan->sec_level = hcon->sec_level; if (!__l2cap_no_conn_pending(chan)) { l2cap_chan_unlock(chan); @@ -7536,14 +7535,11 @@ int __init l2cap_init(void) debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs, &le_default_mps); - bt_6lowpan_init(); - return 0; } void l2cap_exit(void) { - bt_6lowpan_cleanup(); debugfs_remove(l2cap_debugfs); 
l2cap_cleanup_sockets(); } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index e1378693cc90..31f106e61ca2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -99,15 +99,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) if (!bdaddr_type_is_valid(la.l2_bdaddr_type)) return -EINVAL; - if (la.l2_cid) { - /* When the socket gets created it defaults to - * CHAN_CONN_ORIENTED, so we need to overwrite the - * default here. - */ - chan->chan_type = L2CAP_CHAN_FIXED; - chan->omtu = L2CAP_DEFAULT_MTU; - } - if (bdaddr_type_is_le(la.l2_bdaddr_type)) { /* We only allow ATT user space socket */ if (la.l2_cid && @@ -155,6 +146,14 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) case L2CAP_CHAN_RAW: chan->sec_level = BT_SECURITY_SDP; break; + case L2CAP_CHAN_FIXED: + /* Fixed channels default to the L2CAP core not holding a + * hci_conn reference for them. For fixed channels mapping to + * L2CAP sockets we do want to hold a reference so set the + * appropriate flag to request it. 
+ */ + set_bit(FLAG_HOLD_HCI_CONN, &chan->flags); + break; } bacpy(&chan->src, &la.l2_bdaddr); @@ -279,7 +278,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) break; /* fall through */ default: - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto done; } @@ -361,7 +360,8 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, BT_DBG("sock %p, sk %p", sock, sk); if (peer && sk->sk_state != BT_CONNECTED && - sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2) + sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2 && + sk->sk_state != BT_CONFIG) return -ENOTCONN; memset(la, 0, sizeof(struct sockaddr_l2)); @@ -789,6 +789,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, if (chan->scid == L2CAP_CID_ATT) { if (smp_conn_security(conn->hcon, sec.level)) break; + set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -796,7 +797,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, } else if ((sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) || sk->sk_state == BT_CONNECTED) { - if (!l2cap_chan_check_security(chan)) + if (!l2cap_chan_check_security(chan, true)) set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); else sk->sk_state_change(sk); @@ -964,7 +965,7 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, return err; l2cap_chan_lock(chan); - err = l2cap_chan_send(chan, msg, len, sk->sk_priority); + err = l2cap_chan_send(chan, msg, len); l2cap_chan_unlock(chan); return err; @@ -1111,7 +1112,8 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) l2cap_chan_close(chan, 0); lock_sock(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } @@ -1292,6 +1294,7 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan 
*chan, int state, } static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, unsigned long len, int nb) { struct sock *sk = chan->data; @@ -1299,17 +1302,26 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan, int err; l2cap_chan_unlock(chan); - skb = bt_skb_send_alloc(sk, len, nb, &err); + skb = bt_skb_send_alloc(sk, hdr_len + len, nb, &err); l2cap_chan_lock(chan); if (!skb) return ERR_PTR(err); + skb->priority = sk->sk_priority; + bt_cb(skb)->chan = chan; return skb; } +static int l2cap_sock_memcpy_fromiovec_cb(struct l2cap_chan *chan, + unsigned char *kdata, + struct iovec *iov, int len) +{ + return memcpy_fromiovec(kdata, iov, len); +} + static void l2cap_sock_ready_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; @@ -1347,6 +1359,11 @@ static void l2cap_sock_resume_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) { + sk->sk_state = BT_CONNECTED; + chan->state = BT_CONNECTED; + } + clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags); sk->sk_state_change(sk); } @@ -1375,20 +1392,21 @@ static void l2cap_sock_suspend_cb(struct l2cap_chan *chan) sk->sk_state_change(sk); } -static struct l2cap_ops l2cap_chan_ops = { - .name = "L2CAP Socket Interface", - .new_connection = l2cap_sock_new_connection_cb, - .recv = l2cap_sock_recv_cb, - .close = l2cap_sock_close_cb, - .teardown = l2cap_sock_teardown_cb, - .state_change = l2cap_sock_state_change_cb, - .ready = l2cap_sock_ready_cb, - .defer = l2cap_sock_defer_cb, - .resume = l2cap_sock_resume_cb, - .suspend = l2cap_sock_suspend_cb, - .set_shutdown = l2cap_sock_set_shutdown_cb, - .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, - .alloc_skb = l2cap_sock_alloc_skb_cb, +static const struct l2cap_ops l2cap_chan_ops = { + .name = "L2CAP Socket Interface", + .new_connection = l2cap_sock_new_connection_cb, + .recv = l2cap_sock_recv_cb, + .close = l2cap_sock_close_cb, + .teardown = 
l2cap_sock_teardown_cb, + .state_change = l2cap_sock_state_change_cb, + .ready = l2cap_sock_ready_cb, + .defer = l2cap_sock_defer_cb, + .resume = l2cap_sock_resume_cb, + .suspend = l2cap_sock_suspend_cb, + .set_shutdown = l2cap_sock_set_shutdown_cb, + .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, + .alloc_skb = l2cap_sock_alloc_skb_cb, + .memcpy_fromiovec = l2cap_sock_memcpy_fromiovec_cb, }; static void l2cap_sock_destruct(struct sock *sk) diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 941ad7530eda..b36bc0415854 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -135,40 +135,34 @@ int bt_to_errno(__u16 code) } EXPORT_SYMBOL(bt_to_errno); -int bt_info(const char *format, ...) +void bt_info(const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_info("%pV", &vaf); + pr_info("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_info); -int bt_err(const char *format, ...) +void bt_err(const char *format, ...) 
{ struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = pr_err("%pV", &vaf); + pr_err("%pV", &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(bt_err); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index af8e0a6243b7..efb71b022ab6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -35,7 +35,7 @@ #include "smp.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 6 +#define MGMT_REVISION 7 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -44,7 +44,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_DISCOVERABLE, MGMT_OP_SET_CONNECTABLE, MGMT_OP_SET_FAST_CONNECTABLE, - MGMT_OP_SET_PAIRABLE, + MGMT_OP_SET_BONDABLE, MGMT_OP_SET_LINK_SECURITY, MGMT_OP_SET_SSP, MGMT_OP_SET_HS, @@ -85,6 +85,14 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_PRIVACY, MGMT_OP_LOAD_IRKS, MGMT_OP_GET_CONN_INFO, + MGMT_OP_GET_CLOCK_INFO, + MGMT_OP_ADD_DEVICE, + MGMT_OP_REMOVE_DEVICE, + MGMT_OP_LOAD_CONN_PARAM, + MGMT_OP_READ_UNCONF_INDEX_LIST, + MGMT_OP_READ_CONFIG_INFO, + MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_OP_SET_PUBLIC_ADDRESS, }; static const u16 mgmt_events[] = { @@ -111,13 +119,16 @@ static const u16 mgmt_events[] = { MGMT_EV_PASSKEY_NOTIFY, MGMT_EV_NEW_IRK, MGMT_EV_NEW_CSRK, + MGMT_EV_DEVICE_ADDED, + MGMT_EV_DEVICE_REMOVED, + MGMT_EV_NEW_CONN_PARAM, + MGMT_EV_UNCONF_INDEX_ADDED, + MGMT_EV_UNCONF_INDEX_REMOVED, + MGMT_EV_NEW_CONFIG_OPTIONS, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) -#define hdev_is_powered(hdev) (test_bit(HCI_UP, &hdev->flags) && \ - !test_bit(HCI_AUTO_OFF, &hdev->dev_flags)) - struct pending_cmd { struct list_head list; u16 opcode; @@ -200,6 +211,36 @@ static u8 mgmt_status(u8 hci_status) return MGMT_STATUS_FAILED; } +static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 data_len, + struct sock *skip_sk) +{ + struct sk_buff *skb; + struct mgmt_hdr *hdr; + + skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr 
= (void *) skb_put(skb, sizeof(*hdr)); + hdr->opcode = cpu_to_le16(event); + if (hdev) + hdr->index = cpu_to_le16(hdev->id); + else + hdr->index = cpu_to_le16(MGMT_INDEX_NONE); + hdr->len = cpu_to_le16(data_len); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + /* Time stamp */ + __net_timestamp(skb); + + hci_send_to_control(skb, skip_sk); + kfree_skb(skb); + + return 0; +} + static int cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { struct sk_buff *skb; @@ -327,7 +368,8 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_BREDR) + if (d->dev_type == HCI_BREDR && + !test_bit(HCI_UNCONFIGURED, &d->dev_flags)) count++; } @@ -340,13 +382,19 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (test_bit(HCI_SETUP, &d->dev_flags)) + if (test_bit(HCI_SETUP, &d->dev_flags) || + test_bit(HCI_CONFIG, &d->dev_flags) || + test_bit(HCI_USER_CHANNEL, &d->dev_flags)) continue; - if (test_bit(HCI_USER_CHANNEL, &d->dev_flags)) + /* Devices marked as raw-only are neither configured + * nor unconfigured controllers. 
+ */ + if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_BREDR) { + if (d->dev_type == HCI_BREDR && + !test_bit(HCI_UNCONFIGURED, &d->dev_flags)) { rp->index[count++] = cpu_to_le16(d->id); BT_DBG("Added hci%u", d->id); } @@ -365,19 +413,151 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, return err; } +static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_unconf_index_list *rp; + struct hci_dev *d; + size_t rp_len; + u16 count; + int err; + + BT_DBG("sock %p", sk); + + read_lock(&hci_dev_list_lock); + + count = 0; + list_for_each_entry(d, &hci_dev_list, list) { + if (d->dev_type == HCI_BREDR && + test_bit(HCI_UNCONFIGURED, &d->dev_flags)) + count++; + } + + rp_len = sizeof(*rp) + (2 * count); + rp = kmalloc(rp_len, GFP_ATOMIC); + if (!rp) { + read_unlock(&hci_dev_list_lock); + return -ENOMEM; + } + + count = 0; + list_for_each_entry(d, &hci_dev_list, list) { + if (test_bit(HCI_SETUP, &d->dev_flags) || + test_bit(HCI_CONFIG, &d->dev_flags) || + test_bit(HCI_USER_CHANNEL, &d->dev_flags)) + continue; + + /* Devices marked as raw-only are neither configured + * nor unconfigured controllers. 
+ */ + if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + continue; + + if (d->dev_type == HCI_BREDR && + test_bit(HCI_UNCONFIGURED, &d->dev_flags)) { + rp->index[count++] = cpu_to_le16(d->id); + BT_DBG("Added hci%u", d->id); + } + } + + rp->num_controllers = cpu_to_le16(count); + rp_len = sizeof(*rp) + (2 * count); + + read_unlock(&hci_dev_list_lock); + + err = cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_UNCONF_INDEX_LIST, + 0, rp, rp_len); + + kfree(rp); + + return err; +} + +static bool is_configured(struct hci_dev *hdev) +{ + if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + !test_bit(HCI_EXT_CONFIGURED, &hdev->dev_flags)) + return false; + + if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) && + !bacmp(&hdev->public_addr, BDADDR_ANY)) + return false; + + return true; +} + +static __le32 get_missing_options(struct hci_dev *hdev) +{ + u32 options = 0; + + if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + !test_bit(HCI_EXT_CONFIGURED, &hdev->dev_flags)) + options |= MGMT_OPTION_EXTERNAL_CONFIG; + + if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) && + !bacmp(&hdev->public_addr, BDADDR_ANY)) + options |= MGMT_OPTION_PUBLIC_ADDRESS; + + return cpu_to_le32(options); +} + +static int new_options(struct hci_dev *hdev, struct sock *skip) +{ + __le32 options = get_missing_options(hdev); + + return mgmt_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), skip); +} + +static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) +{ + __le32 options = get_missing_options(hdev); + + return cmd_complete(sk, hdev->id, opcode, 0, &options, + sizeof(options)); +} + +static int read_config_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_rp_read_config_info rp; + u32 options = 0; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + rp.manufacturer = cpu_to_le16(hdev->manufacturer); + + if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, 
&hdev->quirks)) + options |= MGMT_OPTION_EXTERNAL_CONFIG; + + if (hdev->set_bdaddr) + options |= MGMT_OPTION_PUBLIC_ADDRESS; + + rp.supported_options = cpu_to_le32(options); + rp.missing_options = get_missing_options(hdev); + + hci_dev_unlock(hdev); + + return cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0, &rp, + sizeof(rp)); +} + static u32 get_supported_settings(struct hci_dev *hdev) { u32 settings = 0; settings |= MGMT_SETTING_POWERED; - settings |= MGMT_SETTING_PAIRABLE; + settings |= MGMT_SETTING_BONDABLE; settings |= MGMT_SETTING_DEBUG_KEYS; + settings |= MGMT_SETTING_CONNECTABLE; + settings |= MGMT_SETTING_DISCOVERABLE; if (lmp_bredr_capable(hdev)) { - settings |= MGMT_SETTING_CONNECTABLE; if (hdev->hci_ver >= BLUETOOTH_VER_1_2) settings |= MGMT_SETTING_FAST_CONNECTABLE; - settings |= MGMT_SETTING_DISCOVERABLE; settings |= MGMT_SETTING_BREDR; settings |= MGMT_SETTING_LINK_SECURITY; @@ -387,7 +567,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) } if (lmp_sc_capable(hdev) || - test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + test_bit(HCI_FORCE_SC, &hdev->dbg_flags)) settings |= MGMT_SETTING_SECURE_CONN; } @@ -397,6 +577,10 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_PRIVACY; } + if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || + hdev->set_bdaddr) + settings |= MGMT_SETTING_CONFIGURATION; + return settings; } @@ -416,8 +600,8 @@ static u32 get_current_settings(struct hci_dev *hdev) if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) settings |= MGMT_SETTING_DISCOVERABLE; - if (test_bit(HCI_PAIRABLE, &hdev->dev_flags)) - settings |= MGMT_SETTING_PAIRABLE; + if (test_bit(HCI_BONDABLE, &hdev->dev_flags)) + settings |= MGMT_SETTING_BONDABLE; if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) settings |= MGMT_SETTING_BREDR; @@ -440,7 +624,7 @@ static u32 get_current_settings(struct hci_dev *hdev) if (test_bit(HCI_SC_ENABLED, &hdev->dev_flags)) settings |= MGMT_SETTING_SECURE_CONN; - if 
(test_bit(HCI_DEBUG_KEYS, &hdev->dev_flags)) + if (test_bit(HCI_KEEP_DEBUG_KEYS, &hdev->dev_flags)) settings |= MGMT_SETTING_DEBUG_KEYS; if (test_bit(HCI_PRIVACY, &hdev->dev_flags)) @@ -571,6 +755,22 @@ static struct pending_cmd *mgmt_pending_find(u16 opcode, struct hci_dev *hdev) return NULL; } +static struct pending_cmd *mgmt_pending_find_data(u16 opcode, + struct hci_dev *hdev, + const void *data) +{ + struct pending_cmd *cmd; + + list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + if (cmd->user_data != data) + continue; + if (cmd->opcode == opcode) + return cmd; + } + + return NULL; +} + static u8 create_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { u8 ad_len = 0; @@ -703,6 +903,16 @@ static void update_adv_data(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); } +int mgmt_update_adv_data(struct hci_dev *hdev) +{ + struct hci_request req; + + hci_req_init(&req, hdev); + update_adv_data(&req); + + return hci_req_run(&req, NULL); +} + static void create_eir(struct hci_dev *hdev, u8 *data) { u8 *ptr = data; @@ -836,6 +1046,13 @@ static bool get_connectable(struct hci_dev *hdev) return test_bit(HCI_CONNECTABLE, &hdev->dev_flags); } +static void disable_advertising(struct hci_request *req) +{ + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + static void enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; @@ -843,12 +1060,18 @@ static void enable_advertising(struct hci_request *req) u8 own_addr_type, enable = 0x01; bool connectable; - /* Clear the HCI_ADVERTISING bit temporarily so that the + if (hci_conn_num(hdev, LE_LINK) > 0) + return; + + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) + disable_advertising(req); + + /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. 
*/ - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + clear_bit(HCI_LE_ADV, &hdev->dev_flags); connectable = get_connectable(hdev); @@ -860,8 +1083,8 @@ static void enable_advertising(struct hci_request *req) return; memset(&cp, 0, sizeof(cp)); - cp.min_interval = cpu_to_le16(0x0800); - cp.max_interval = cpu_to_le16(0x0800); + cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval); + cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval); cp.type = connectable ? LE_ADV_IND : LE_ADV_NONCONN_IND; cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; @@ -871,13 +1094,6 @@ static void enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -static void disable_advertising(struct hci_request *req) -{ - u8 enable = 0x00; - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - static void service_cache_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -909,19 +1125,14 @@ static void rpa_expired(struct work_struct *work) set_bit(HCI_RPA_EXPIRED, &hdev->dev_flags); - if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags) || - hci_conn_num(hdev, LE_LINK) > 0) + if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags)) return; /* The generation of a new RPA and programming it into the * controller happens in the enable_advertising() function. 
*/ - hci_req_init(&req, hdev); - - disable_advertising(&req); enable_advertising(&req); - hci_req_run(&req, NULL); } @@ -938,7 +1149,7 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) * for mgmt we require user-space to explicitly enable * it */ - clear_bit(HCI_PAIRABLE, &hdev->dev_flags); + clear_bit(HCI_BONDABLE, &hdev->dev_flags); } static int read_controller_info(struct sock *sk, struct hci_dev *hdev, @@ -984,7 +1195,7 @@ static struct pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, { struct pending_cmd *cmd; - cmd = kmalloc(sizeof(*cmd), GFP_KERNEL); + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) return NULL; @@ -1047,7 +1258,7 @@ static void clean_up_hci_complete(struct hci_dev *hdev, u8 status) } } -static void hci_stop_discovery(struct hci_request *req) +static bool hci_stop_discovery(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct hci_cp_remote_name_req_cancel cp; @@ -1062,32 +1273,39 @@ static void hci_stop_discovery(struct hci_request *req) hci_req_add_le_scan_disable(req); } - break; + return true; case DISCOVERY_RESOLVING: e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_PENDING); if (!e) - return; + break; bacpy(&cp.bdaddr, &e->data.bdaddr); hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), &cp); - break; + return true; default: /* Passive scanning */ - if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) + if (test_bit(HCI_LE_SCAN, &hdev->dev_flags)) { hci_req_add_le_scan_disable(req); + return true; + } + break; } + + return false; } static int clean_up_hci_state(struct hci_dev *hdev) { struct hci_request req; struct hci_conn *conn; + bool discov_stopped; + int err; hci_req_init(&req, hdev); @@ -1097,10 +1315,10 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) disable_advertising(&req); - hci_stop_discovery(&req); + 
discov_stopped = hci_stop_discovery(&req); list_for_each_entry(conn, &hdev->conn_hash.list, list) { struct hci_cp_disconnect dc; @@ -1134,7 +1352,11 @@ static int clean_up_hci_state(struct hci_dev *hdev) } } - return hci_req_run(&req, clean_up_hci_complete); + err = hci_req_run(&req, clean_up_hci_complete); + if (!err && discov_stopped) + hci_discovery_set_state(hdev, DISCOVERY_STOPPING); + + return err; } static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, @@ -1203,36 +1425,6 @@ failed: return err; } -static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 data_len, - struct sock *skip_sk) -{ - struct sk_buff *skb; - struct mgmt_hdr *hdr; - - skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = (void *) skb_put(skb, sizeof(*hdr)); - hdr->opcode = cpu_to_le16(event); - if (hdev) - hdr->index = cpu_to_le16(hdev->id); - else - hdr->index = cpu_to_le16(MGMT_INDEX_NONE); - hdr->len = cpu_to_le16(data_len); - - if (data) - memcpy(skb_put(skb, data_len), data, data_len); - - /* Time stamp */ - __net_timestamp(skb); - - hci_send_to_control(skb, skip_sk); - kfree_skb(skb); - - return 0; -} - static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev; @@ -1242,6 +1434,11 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) return mgmt_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, sizeof(ev), skip); } +int mgmt_new_settings(struct hci_dev *hdev) +{ + return new_settings(hdev, NULL); +} + struct cmd_lookup { struct sock *sk; struct hci_dev *hdev; @@ -1336,9 +1533,11 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status) /* When the discoverable mode gets changed, make sure * that class of device has the limited discoverable - * bit correctly set. + * bit correctly set. Also update page scan based on whitelist + * entries. 
*/ hci_req_init(&req, hdev); + hci_update_page_scan(hdev, &req); update_class(&req); hci_req_run(&req, NULL); @@ -1553,7 +1752,7 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status) { struct pending_cmd *cmd; struct mgmt_mode *cp; - bool changed; + bool conn_changed, discov_changed; BT_DBG("status 0x%02x", status); @@ -1570,15 +1769,26 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status) } cp = cmd->param; - if (cp->val) - changed = !test_and_set_bit(HCI_CONNECTABLE, &hdev->dev_flags); - else - changed = test_and_clear_bit(HCI_CONNECTABLE, &hdev->dev_flags); + if (cp->val) { + conn_changed = !test_and_set_bit(HCI_CONNECTABLE, + &hdev->dev_flags); + discov_changed = false; + } else { + conn_changed = test_and_clear_bit(HCI_CONNECTABLE, + &hdev->dev_flags); + discov_changed = test_and_clear_bit(HCI_DISCOVERABLE, + &hdev->dev_flags); + } send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev); - if (changed) + if (conn_changed || discov_changed) { new_settings(hdev, cmd->sk); + hci_update_page_scan(hdev, NULL); + if (discov_changed) + mgmt_update_adv_data(hdev); + hci_update_background_scan(hdev); + } remove_cmd: mgmt_pending_remove(cmd); @@ -1607,8 +1817,11 @@ static int set_connectable_update_settings(struct hci_dev *hdev, if (err < 0) return err; - if (changed) + if (changed) { + hci_update_page_scan(hdev, NULL); + hci_update_background_scan(hdev); return new_settings(hdev, sk); + } return 0; } @@ -1669,7 +1882,18 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->val) { scan = SCAN_PAGE; } else { - scan = 0; + /* If we don't have any whitelist entries just + * disable all scanning. If there are entries + * and we had both page and inquiry scanning + * enabled then fall back to only page scanning. + * Otherwise no changes are needed. 
+ */ + if (list_empty(&hdev->whitelist)) + scan = SCAN_DISABLED; + else if (test_bit(HCI_ISCAN, &hdev->flags)) + scan = SCAN_PAGE; + else + goto no_scan_update; if (test_bit(HCI_ISCAN, &hdev->flags) && hdev->discov_timeout > 0) @@ -1679,6 +1903,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } +no_scan_update: /* If we're going from non-connectable to connectable or * vice-versa when fast connectable is enabled ensure that fast * connectable gets disabled. write_fast_connectable won't do @@ -1688,11 +1913,9 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->val || test_bit(HCI_FAST_CONNECTABLE, &hdev->dev_flags)) write_fast_connectable(&req, false); - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags) && - hci_conn_num(hdev, LE_LINK) == 0) { - disable_advertising(&req); + /* Update the advertising parameters if necessary */ + if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) enable_advertising(&req); - } err = hci_req_run(&req, set_connectable_complete); if (err < 0) { @@ -1708,7 +1931,7 @@ failed: return err; } -static int set_pairable(struct sock *sk, struct hci_dev *hdev, void *data, +static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; @@ -1718,17 +1941,17 @@ static int set_pairable(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("request for %s", hdev->name); if (cp->val != 0x00 && cp->val != 0x01) - return cmd_status(sk, hdev->id, MGMT_OP_SET_PAIRABLE, + return cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (cp->val) - changed = !test_and_set_bit(HCI_PAIRABLE, &hdev->dev_flags); + changed = !test_and_set_bit(HCI_BONDABLE, &hdev->dev_flags); else - changed = test_and_clear_bit(HCI_PAIRABLE, &hdev->dev_flags); + changed = test_and_clear_bit(HCI_BONDABLE, &hdev->dev_flags); - err = send_settings_rsp(sk, 
MGMT_OP_SET_PAIRABLE, hdev); + err = send_settings_rsp(sk, MGMT_OP_SET_BONDABLE, hdev); if (err < 0) goto unlock; @@ -1877,6 +2100,10 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto failed; } + if (!cp->val && test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags)) + hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, + sizeof(cp->val), &cp->val); + err = hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, 1, &cp->val); if (err < 0) { mgmt_pending_remove(cmd); @@ -1973,6 +2200,8 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status) update_scan_rsp_data(&req); hci_req_run(&req, NULL); + hci_update_background_scan(hdev); + hci_dev_unlock(hdev); } } @@ -2048,9 +2277,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) if (val) { hci_cp.le = val; - hci_cp.simul = lmp_le_br_capable(hdev); + hci_cp.simul = 0x00; } else { - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) disable_advertising(&req); } @@ -2373,6 +2602,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_load_link_keys *cp = data; + const u16 max_key_count = ((U16_MAX - sizeof(*cp)) / + sizeof(struct mgmt_link_key_info)); u16 key_count, expected_len; bool changed; int i; @@ -2384,6 +2615,12 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_NOT_SUPPORTED); key_count = __le16_to_cpu(cp->key_count); + if (key_count > max_key_count) { + BT_ERR("load_link_keys: too big key_count value %u", + key_count); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_link_key_info); @@ -2414,9 +2651,11 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, hci_link_keys_clear(hdev); if (cp->debug_keys) - changed = !test_and_set_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = 
!test_and_set_bit(HCI_KEEP_DEBUG_KEYS, + &hdev->dev_flags); else - changed = test_and_clear_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = test_and_clear_bit(HCI_KEEP_DEBUG_KEYS, + &hdev->dev_flags); if (changed) new_settings(hdev, NULL); @@ -2424,8 +2663,14 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; - hci_add_link_key(hdev, NULL, 0, &key->addr.bdaddr, key->val, - key->type, key->pin_len); + /* Always ignore debug keys and require a new pairing if + * the user wants to use them. + */ + if (key->type == HCI_LK_DEBUG_COMBINATION) + continue; + + hci_add_link_key(hdev, NULL, &key->addr.bdaddr, key->val, + key->type, key->pin_len, NULL); } cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0); @@ -2543,7 +2788,6 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_disconnect *cp = data; struct mgmt_rp_disconnect rp; - struct hci_cp_disconnect dc; struct pending_cmd *cmd; struct hci_conn *conn; int err; @@ -2591,10 +2835,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - dc.handle = cpu_to_le16(conn->handle); - dc.reason = HCI_ERROR_REMOTE_USER_TERM; - - err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc); + err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM); if (err < 0) mgmt_pending_remove(cmd); @@ -2766,6 +3007,10 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); + if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) + return cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, + MGMT_STATUS_INVALID_PARAMS, NULL, 0); + hci_dev_lock(hdev); hdev->io_capability = cp->io_capability; @@ -2814,6 +3059,7 @@ static void pairing_complete(struct pending_cmd *cmd, u8 status) conn->disconn_cfm_cb = NULL; hci_conn_drop(conn); + hci_conn_put(conn); mgmt_pending_remove(cmd); } @@ -2878,6 +3124,11 @@ static int 
pair_device(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS, &rp, sizeof(rp)); + if (cp->io_cap > SMP_IO_KEYBOARD_DISPLAY) + return cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { @@ -2902,8 +3153,20 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, else addr_type = ADDR_LE_DEV_RANDOM; + /* When pairing a new device, it is expected to remember + * this device for future connections. Adding the connection + * parameter information ahead of time allows tracking + * of the slave preferred values and will speed up any + * further connection establishment. + * + * If connection parameters already exist, then they + * will be kept and this function does nothing. + */ + hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type, - sec_level, auth_type); + sec_level, HCI_LE_CONN_TIMEOUT, + HCI_ROLE_MASTER); } if (IS_ERR(conn)) { @@ -2946,10 +3209,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, } conn->io_capability = cp->io_cap; - cmd->user_data = conn; + cmd->user_data = hci_conn_get(conn); - if (conn->state == BT_CONNECTED && - hci_conn_security(conn, sec_level, auth_type)) + if ((conn->state == BT_CONNECTED || conn->state == BT_CONFIG) && + hci_conn_security(conn, sec_level, auth_type, true)) pairing_complete(cmd, 0); err = 0; @@ -3031,14 +3294,7 @@ static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev, } if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) { - /* Continue with pairing via SMP. The hdev lock must be - * released as SMP may try to recquire it for crypto - * purposes. 
- */ - hci_dev_unlock(hdev); err = smp_user_confirm_reply(conn, mgmt_op, passkey); - hci_dev_lock(hdev); - if (!err) err = cmd_complete(sk, hdev->id, mgmt_op, MGMT_STATUS_SUCCESS, addr, @@ -3516,11 +3772,21 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } - if (test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { - err = cmd_status(sk, hdev->id, MGMT_OP_START_DISCOVERY, - MGMT_STATUS_REJECTED); - mgmt_pending_remove(cmd); - goto failed; + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) { + /* Don't let discovery abort an outgoing + * connection attempt that's using directed + * advertising. + */ + if (hci_conn_hash_lookup_state(hdev, LE_LINK, + BT_CONNECT)) { + err = cmd_status(sk, hdev->id, + MGMT_OP_START_DISCOVERY, + MGMT_STATUS_REJECTED); + mgmt_pending_remove(cmd); + goto failed; + } + + disable_advertising(&req); } /* If controller is scanning, it means the background scanning @@ -3723,12 +3989,18 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_blacklist_add(hdev, &cp->addr.bdaddr, cp->addr.type); - if (err < 0) + err = hci_bdaddr_list_add(&hdev->blacklist, &cp->addr.bdaddr, + cp->addr.type); + if (err < 0) { status = MGMT_STATUS_FAILED; - else - status = MGMT_STATUS_SUCCESS; + goto done; + } + + mgmt_event(MGMT_EV_DEVICE_BLOCKED, hdev, &cp->addr, sizeof(cp->addr), + sk); + status = MGMT_STATUS_SUCCESS; +done: err = cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status, &cp->addr, sizeof(cp->addr)); @@ -3753,12 +4025,18 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - err = hci_blacklist_del(hdev, &cp->addr.bdaddr, cp->addr.type); - if (err < 0) + err = hci_bdaddr_list_del(&hdev->blacklist, &cp->addr.bdaddr, + cp->addr.type); + if (err < 0) { status = MGMT_STATUS_INVALID_PARAMS; - else - status = MGMT_STATUS_SUCCESS; + goto done; + } + mgmt_event(MGMT_EV_DEVICE_UNBLOCKED, hdev, &cp->addr, sizeof(cp->addr), + sk); + 
status = MGMT_STATUS_SUCCESS; + +done: err = cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status, &cp->addr, sizeof(cp->addr)); @@ -3813,6 +4091,11 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status) return; } + if (test_bit(HCI_LE_ADV, &hdev->dev_flags)) + set_bit(HCI_ADVERTISING, &hdev->dev_flags); + else + clear_bit(HCI_ADVERTISING, &hdev->dev_flags); + mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp, &match); @@ -3853,7 +4136,9 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, * necessary). */ if (!hdev_is_powered(hdev) || val == enabled || - hci_conn_num(hdev, LE_LINK) > 0) { + hci_conn_num(hdev, LE_LINK) > 0 || + (test_bit(HCI_LE_SCAN, &hdev->dev_flags) && + hdev->le_scan_type == LE_SCAN_ACTIVE)) { bool changed = false; if (val != test_bit(HCI_ADVERTISING, &hdev->dev_flags)) { @@ -4094,26 +4379,6 @@ unlock: return err; } -static void set_bredr_scan(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 scan = 0; - - /* Ensure that fast connectable is disabled. This function will - * not do anything if the page scan parameters are already what - * they should be. - */ - write_fast_connectable(req, false); - - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) - scan |= SCAN_PAGE; - if (test_bit(HCI_DISCOVERABLE, &hdev->dev_flags)) - scan |= SCAN_INQUIRY; - - if (scan) - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - static void set_bredr_complete(struct hci_dev *hdev, u8 status) { struct pending_cmd *cmd; @@ -4219,8 +4484,8 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_req_init(&req, hdev); - if (test_bit(HCI_CONNECTABLE, &hdev->dev_flags)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); /* Since only the advertising data flags will change, there * is no need to update the scan response data. 
@@ -4252,7 +4517,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, status); if (!lmp_sc_capable(hdev) && - !test_bit(HCI_FORCE_SC, &hdev->dev_flags)) + !test_bit(HCI_FORCE_SC, &hdev->dbg_flags)) return cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_NOT_SUPPORTED); @@ -4328,21 +4593,37 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; - bool changed; + bool changed, use_changed; int err; BT_DBG("request for %s", hdev->name); - if (cp->val != 0x00 && cp->val != 0x01) + if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (cp->val) - changed = !test_and_set_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = !test_and_set_bit(HCI_KEEP_DEBUG_KEYS, + &hdev->dev_flags); else - changed = test_and_clear_bit(HCI_DEBUG_KEYS, &hdev->dev_flags); + changed = test_and_clear_bit(HCI_KEEP_DEBUG_KEYS, + &hdev->dev_flags); + + if (cp->val == 0x02) + use_changed = !test_and_set_bit(HCI_USE_DEBUG_KEYS, + &hdev->dev_flags); + else + use_changed = test_and_clear_bit(HCI_USE_DEBUG_KEYS, + &hdev->dev_flags); + + if (hdev_is_powered(hdev) && use_changed && + test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + u8 mode = (cp->val == 0x02) ? 
0x01 : 0x00; + hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, + sizeof(mode), &mode); + } err = send_settings_rsp(sk, MGMT_OP_SET_DEBUG_KEYS, hdev); if (err < 0) @@ -4426,6 +4707,8 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_load_irks *cp = cp_data; + const u16 max_irk_count = ((U16_MAX - sizeof(*cp)) / + sizeof(struct mgmt_irk_info)); u16 irk_count, expected_len; int i, err; @@ -4436,6 +4719,11 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_NOT_SUPPORTED); irk_count = __le16_to_cpu(cp->irk_count); + if (irk_count > max_irk_count) { + BT_ERR("load_irks: too big irk_count value %u", irk_count); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, + MGMT_STATUS_INVALID_PARAMS); + } expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); if (expected_len != len) { @@ -4505,6 +4793,8 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_load_long_term_keys *cp = cp_data; + const u16 max_key_count = ((U16_MAX - sizeof(*cp)) / + sizeof(struct mgmt_ltk_info)); u16 key_count, expected_len; int i, err; @@ -4515,6 +4805,11 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_NOT_SUPPORTED); key_count = __le16_to_cpu(cp->key_count); + if (key_count > max_key_count) { + BT_ERR("load_ltks: too big key_count value %u", key_count); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, + MGMT_STATUS_INVALID_PARAMS); + } expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_ltk_info); @@ -4550,9 +4845,9 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, addr_type = ADDR_LE_DEV_RANDOM; if (key->master) - type = HCI_SMP_LTK; + type = SMP_LTK; else - type = HCI_SMP_LTK_SLAVE; + type = SMP_LTK_SLAVE; switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: @@ -4616,6 +4911,7 @@ static void get_conn_info_complete(struct pending_cmd *cmd, void 
*data) match->mgmt_status, &rp, sizeof(rp)); hci_conn_drop(conn); + hci_conn_put(conn); mgmt_pending_remove(cmd); } @@ -4772,7 +5068,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, } hci_conn_hold(conn); - cmd->user_data = conn; + cmd->user_data = hci_conn_get(conn); conn->conn_info_timestamp = jiffies; } else { @@ -4790,6 +5086,536 @@ unlock: return err; } +static void get_clock_info_complete(struct hci_dev *hdev, u8 status) +{ + struct mgmt_cp_get_clock_info *cp; + struct mgmt_rp_get_clock_info rp; + struct hci_cp_read_clock *hci_cp; + struct pending_cmd *cmd; + struct hci_conn *conn; + + BT_DBG("%s status %u", hdev->name, status); + + hci_dev_lock(hdev); + + hci_cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK); + if (!hci_cp) + goto unlock; + + if (hci_cp->which) { + u16 handle = __le16_to_cpu(hci_cp->handle); + conn = hci_conn_hash_lookup_handle(hdev, handle); + } else { + conn = NULL; + } + + cmd = mgmt_pending_find_data(MGMT_OP_GET_CLOCK_INFO, hdev, conn); + if (!cmd) + goto unlock; + + cp = cmd->param; + + memset(&rp, 0, sizeof(rp)); + memcpy(&rp.addr, &cp->addr, sizeof(rp.addr)); + + if (status) + goto send_rsp; + + rp.local_clock = cpu_to_le32(hdev->clock); + + if (conn) { + rp.piconet_clock = cpu_to_le32(conn->clock); + rp.accuracy = cpu_to_le16(conn->clock_accuracy); + } + +send_rsp: + cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(status), + &rp, sizeof(rp)); + mgmt_pending_remove(cmd); + if (conn) { + hci_conn_drop(conn); + hci_conn_put(conn); + } + +unlock: + hci_dev_unlock(hdev); +} + +static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_get_clock_info *cp = data; + struct mgmt_rp_get_clock_info rp; + struct hci_cp_read_clock hci_cp; + struct pending_cmd *cmd; + struct hci_request req; + struct hci_conn *conn; + int err; + + BT_DBG("%s", hdev->name); + + memset(&rp, 0, sizeof(rp)); + bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); + rp.addr.type = 
cp->addr.type; + + if (cp->addr.type != BDADDR_BREDR) + return cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_INVALID_PARAMS, + &rp, sizeof(rp)); + + hci_dev_lock(hdev); + + if (!hdev_is_powered(hdev)) { + err = cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); + goto unlock; + } + + if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) { + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, + &cp->addr.bdaddr); + if (!conn || conn->state != BT_CONNECTED) { + err = cmd_complete(sk, hdev->id, + MGMT_OP_GET_CLOCK_INFO, + MGMT_STATUS_NOT_CONNECTED, + &rp, sizeof(rp)); + goto unlock; + } + } else { + conn = NULL; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + hci_req_init(&req, hdev); + + memset(&hci_cp, 0, sizeof(hci_cp)); + hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp); + + if (conn) { + hci_conn_hold(conn); + cmd->user_data = hci_conn_get(conn); + + hci_cp.handle = cpu_to_le16(conn->handle); + hci_cp.which = 0x01; /* Piconet clock */ + hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp); + } + + err = hci_req_run(&req, get_clock_info_complete); + if (err < 0) + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static void device_added(struct sock *sk, struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 type, u8 action) +{ + struct mgmt_ev_device_added ev; + + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = type; + ev.action = action; + + mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); +} + +static int add_device(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_add_device *cp = data; + u8 auto_conn, addr_type; + int err; + + BT_DBG("%s", hdev->name); + + if (!bdaddr_type_is_valid(cp->addr.type) || + !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) + return cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, 
sizeof(cp->addr)); + + if (cp->action != 0x00 && cp->action != 0x01 && cp->action != 0x02) + return cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + + hci_dev_lock(hdev); + + if (cp->addr.type == BDADDR_BREDR) { + /* Only incoming connections action is supported for now */ + if (cp->action != 0x01) { + err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr, + cp->addr.type); + if (err) + goto unlock; + + hci_update_page_scan(hdev, NULL); + + goto added; + } + + if (cp->addr.type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + if (cp->action == 0x02) + auto_conn = HCI_AUTO_CONN_ALWAYS; + else if (cp->action == 0x01) + auto_conn = HCI_AUTO_CONN_DIRECT; + else + auto_conn = HCI_AUTO_CONN_REPORT; + + /* If the connection parameters don't exist for this device, + * they will be created and configured with defaults. 
+ */ + if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, + auto_conn) < 0) { + err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + +added: + device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); + + err = cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_SUCCESS, &cp->addr, sizeof(cp->addr)); + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static void device_removed(struct sock *sk, struct hci_dev *hdev, + bdaddr_t *bdaddr, u8 type) +{ + struct mgmt_ev_device_removed ev; + + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = type; + + mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk); +} + +static int remove_device(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_remove_device *cp = data; + int err; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) { + struct hci_conn_params *params; + u8 addr_type; + + if (!bdaddr_type_is_valid(cp->addr.type)) { + err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + if (cp->addr.type == BDADDR_BREDR) { + err = hci_bdaddr_list_del(&hdev->whitelist, + &cp->addr.bdaddr, + cp->addr.type); + if (err) { + err = cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + hci_update_page_scan(hdev, NULL); + + device_removed(sk, hdev, &cp->addr.bdaddr, + cp->addr.type); + goto complete; + } + + if (cp->addr.type == BDADDR_LE_PUBLIC) + addr_type = ADDR_LE_DEV_PUBLIC; + else + addr_type = ADDR_LE_DEV_RANDOM; + + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + addr_type); + if (!params) { + err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + if (params->auto_connect == 
HCI_AUTO_CONN_DISABLED) { + err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + list_del(&params->action); + list_del(&params->list); + kfree(params); + hci_update_background_scan(hdev); + + device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); + } else { + struct hci_conn_params *p, *tmp; + struct bdaddr_list *b, *btmp; + + if (cp->addr.type) { + err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); + goto unlock; + } + + list_for_each_entry_safe(b, btmp, &hdev->whitelist, list) { + device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type); + list_del(&b->list); + kfree(b); + } + + hci_update_page_scan(hdev, NULL); + + list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) { + if (p->auto_connect == HCI_AUTO_CONN_DISABLED) + continue; + device_removed(sk, hdev, &p->addr, p->addr_type); + list_del(&p->action); + list_del(&p->list); + kfree(p); + } + + BT_DBG("All LE connection parameters were removed"); + + hci_update_background_scan(hdev); + } + +complete: + err = cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_SUCCESS, &cp->addr, sizeof(cp->addr)); + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_load_conn_param *cp = data; + const u16 max_param_count = ((U16_MAX - sizeof(*cp)) / + sizeof(struct mgmt_conn_param)); + u16 param_count, expected_len; + int i; + + if (!lmp_le_capable(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_NOT_SUPPORTED); + + param_count = __le16_to_cpu(cp->param_count); + if (param_count > max_param_count) { + BT_ERR("load_conn_param: too big param_count value %u", + param_count); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_INVALID_PARAMS); + } + + expected_len = sizeof(*cp) + param_count * + 
sizeof(struct mgmt_conn_param); + if (expected_len != len) { + BT_ERR("load_conn_param: expected %u bytes, got %u bytes", + expected_len, len); + return cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, + MGMT_STATUS_INVALID_PARAMS); + } + + BT_DBG("%s param_count %u", hdev->name, param_count); + + hci_dev_lock(hdev); + + hci_conn_params_clear_disabled(hdev); + + for (i = 0; i < param_count; i++) { + struct mgmt_conn_param *param = &cp->params[i]; + struct hci_conn_params *hci_param; + u16 min, max, latency, timeout; + u8 addr_type; + + BT_DBG("Adding %pMR (type %u)", &param->addr.bdaddr, + param->addr.type); + + if (param->addr.type == BDADDR_LE_PUBLIC) { + addr_type = ADDR_LE_DEV_PUBLIC; + } else if (param->addr.type == BDADDR_LE_RANDOM) { + addr_type = ADDR_LE_DEV_RANDOM; + } else { + BT_ERR("Ignoring invalid connection parameters"); + continue; + } + + min = le16_to_cpu(param->min_interval); + max = le16_to_cpu(param->max_interval); + latency = le16_to_cpu(param->latency); + timeout = le16_to_cpu(param->timeout); + + BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", + min, max, latency, timeout); + + if (hci_check_conn_params(min, max, latency, timeout) < 0) { + BT_ERR("Ignoring invalid connection parameters"); + continue; + } + + hci_param = hci_conn_params_add(hdev, &param->addr.bdaddr, + addr_type); + if (!hci_param) { + BT_ERR("Failed to add connection parameters"); + continue; + } + + hci_param->conn_min_interval = min; + hci_param->conn_max_interval = max; + hci_param->conn_latency = latency; + hci_param->supervision_timeout = timeout; + } + + hci_dev_unlock(hdev); + + return cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0, NULL, 0); +} + +static int set_external_config(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_set_external_config *cp = data; + bool changed; + int err; + + BT_DBG("%s", hdev->name); + + if (hdev_is_powered(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + 
MGMT_STATUS_REJECTED); + + if (cp->config != 0x00 && cp->config != 0x01) + return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_STATUS_INVALID_PARAMS); + + if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, + MGMT_STATUS_NOT_SUPPORTED); + + hci_dev_lock(hdev); + + if (cp->config) + changed = !test_and_set_bit(HCI_EXT_CONFIGURED, + &hdev->dev_flags); + else + changed = test_and_clear_bit(HCI_EXT_CONFIGURED, + &hdev->dev_flags); + + err = send_options_rsp(sk, MGMT_OP_SET_EXTERNAL_CONFIG, hdev); + if (err < 0) + goto unlock; + + if (!changed) + goto unlock; + + err = new_options(hdev, sk); + + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) == is_configured(hdev)) { + mgmt_index_removed(hdev); + + if (test_and_change_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) { + set_bit(HCI_CONFIG, &hdev->dev_flags); + set_bit(HCI_AUTO_OFF, &hdev->dev_flags); + + queue_work(hdev->req_workqueue, &hdev->power_on); + } else { + set_bit(HCI_RAW, &hdev->flags); + mgmt_index_added(hdev); + } + } + +unlock: + hci_dev_unlock(hdev); + return err; +} + +static int set_public_address(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_set_public_address *cp = data; + bool changed; + int err; + + BT_DBG("%s", hdev->name); + + if (hdev_is_powered(hdev)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_REJECTED); + + if (!bacmp(&cp->bdaddr, BDADDR_ANY)) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_INVALID_PARAMS); + + if (!hdev->set_bdaddr) + return cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, + MGMT_STATUS_NOT_SUPPORTED); + + hci_dev_lock(hdev); + + changed = !!bacmp(&hdev->public_addr, &cp->bdaddr); + bacpy(&hdev->public_addr, &cp->bdaddr); + + err = send_options_rsp(sk, MGMT_OP_SET_PUBLIC_ADDRESS, hdev); + if (err < 0) + goto unlock; + + if (!changed) + goto unlock; + + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) 
+ err = new_options(hdev, sk); + + if (is_configured(hdev)) { + mgmt_index_removed(hdev); + + clear_bit(HCI_UNCONFIGURED, &hdev->dev_flags); + + set_bit(HCI_CONFIG, &hdev->dev_flags); + set_bit(HCI_AUTO_OFF, &hdev->dev_flags); + + queue_work(hdev->req_workqueue, &hdev->power_on); + } + +unlock: + hci_dev_unlock(hdev); + return err; +} + static const struct mgmt_handler { int (*func) (struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len); @@ -4805,7 +5631,7 @@ static const struct mgmt_handler { { set_discoverable, false, MGMT_SET_DISCOVERABLE_SIZE }, { set_connectable, false, MGMT_SETTING_SIZE }, { set_fast_connectable, false, MGMT_SETTING_SIZE }, - { set_pairable, false, MGMT_SETTING_SIZE }, + { set_bondable, false, MGMT_SETTING_SIZE }, { set_link_security, false, MGMT_SETTING_SIZE }, { set_ssp, false, MGMT_SETTING_SIZE }, { set_hs, false, MGMT_SETTING_SIZE }, @@ -4846,9 +5672,16 @@ static const struct mgmt_handler { { set_privacy, false, MGMT_SET_PRIVACY_SIZE }, { load_irks, true, MGMT_LOAD_IRKS_SIZE }, { get_conn_info, false, MGMT_GET_CONN_INFO_SIZE }, + { get_clock_info, false, MGMT_GET_CLOCK_INFO_SIZE }, + { add_device, false, MGMT_ADD_DEVICE_SIZE }, + { remove_device, false, MGMT_REMOVE_DEVICE_SIZE }, + { load_conn_param, true, MGMT_LOAD_CONN_PARAM_SIZE }, + { read_unconf_index_list, false, MGMT_READ_UNCONF_INDEX_LIST_SIZE }, + { read_config_info, false, MGMT_READ_CONFIG_INFO_SIZE }, + { set_external_config, false, MGMT_SET_EXTERNAL_CONFIG_SIZE }, + { set_public_address, false, MGMT_SET_PUBLIC_ADDRESS_SIZE }, }; - int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) { void *buf; @@ -4892,11 +5725,21 @@ int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) } if (test_bit(HCI_SETUP, &hdev->dev_flags) || + test_bit(HCI_CONFIG, &hdev->dev_flags) || test_bit(HCI_USER_CHANNEL, &hdev->dev_flags)) { err = cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; } + + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags) 
&& + opcode != MGMT_OP_READ_CONFIG_INFO && + opcode != MGMT_OP_SET_EXTERNAL_CONFIG && + opcode != MGMT_OP_SET_PUBLIC_ADDRESS) { + err = cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } } if (opcode >= ARRAY_SIZE(mgmt_handlers) || @@ -4907,8 +5750,15 @@ int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen) goto done; } - if ((hdev && opcode < MGMT_OP_READ_INFO) || - (!hdev && opcode >= MGMT_OP_READ_INFO)) { + if (hdev && (opcode <= MGMT_OP_READ_INDEX_LIST || + opcode == MGMT_OP_READ_UNCONF_INDEX_LIST)) { + err = cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } + + if (!hdev && (opcode > MGMT_OP_READ_INDEX_LIST && + opcode != MGMT_OP_READ_UNCONF_INDEX_LIST)) { err = cmd_status(sk, index, opcode, MGMT_STATUS_INVALID_INDEX); goto done; @@ -4947,7 +5797,13 @@ void mgmt_index_added(struct hci_dev *hdev) if (hdev->dev_type != HCI_BREDR) return; - mgmt_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, NULL); + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + return; + + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + mgmt_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, NULL); + else + mgmt_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, NULL); } void mgmt_index_removed(struct hci_dev *hdev) @@ -4957,20 +5813,42 @@ void mgmt_index_removed(struct hci_dev *hdev) if (hdev->dev_type != HCI_BREDR) return; + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + return; + mgmt_pending_foreach(0, hdev, cmd_status_rsp, &status); - mgmt_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, NULL); + if (test_bit(HCI_UNCONFIGURED, &hdev->dev_flags)) + mgmt_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0, NULL); + else + mgmt_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, NULL); } /* This function requires the caller holds hdev->lock */ -static void restart_le_auto_conns(struct hci_dev *hdev) +static void restart_le_actions(struct hci_dev *hdev) { struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { - if 
(p->auto_connect == HCI_AUTO_CONN_ALWAYS) - hci_pend_le_conn_add(hdev, &p->addr, p->addr_type); + /* Needed for AUTO_OFF case where might not "really" + * have been powered off. + */ + list_del_init(&p->action); + + switch (p->auto_connect) { + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + list_add(&p->action, &hdev->pend_le_conns); + break; + case HCI_AUTO_CONN_REPORT: + list_add(&p->action, &hdev->pend_le_reports); + break; + default: + break; + } } + + hci_update_background_scan(hdev); } static void powered_complete(struct hci_dev *hdev, u8 status) @@ -4981,7 +5859,7 @@ static void powered_complete(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); - restart_le_auto_conns(hdev); + restart_le_actions(hdev); mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); @@ -5011,8 +5889,8 @@ static int powered_update_hci(struct hci_dev *hdev) lmp_bredr_capable(hdev)) { struct hci_cp_write_le_host_supported cp; - cp.le = 1; - cp.simul = lmp_le_br_capable(hdev); + cp.le = 0x01; + cp.simul = 0x00; /* Check first if we already have the right * host state (host features set) @@ -5043,8 +5921,8 @@ static int powered_update_hci(struct hci_dev *hdev) sizeof(link_sec), &link_sec); if (lmp_bredr_capable(hdev)) { - if (test_bit(HCI_BREDR_ENABLED, &hdev->dev_flags)) - set_bredr_scan(&req); + write_fast_connectable(&req, false); + hci_update_page_scan(hdev, &req); update_class(&req); update_name(&req); update_eir(&req); @@ -5138,92 +6016,6 @@ void mgmt_discoverable_timeout(struct hci_dev *hdev) hci_dev_unlock(hdev); } -void mgmt_discoverable(struct hci_dev *hdev, u8 discoverable) -{ - bool changed; - - /* Nothing needed here if there's a pending command since that - * commands request completion callback takes care of everything - * necessary. 
- */ - if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) - return; - - /* Powering off may clear the scan mode - don't let that interfere */ - if (!discoverable && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) - return; - - if (discoverable) { - changed = !test_and_set_bit(HCI_DISCOVERABLE, &hdev->dev_flags); - } else { - clear_bit(HCI_LIMITED_DISCOVERABLE, &hdev->dev_flags); - changed = test_and_clear_bit(HCI_DISCOVERABLE, &hdev->dev_flags); - } - - if (changed) { - struct hci_request req; - - /* In case this change in discoverable was triggered by - * a disabling of connectable there could be a need to - * update the advertising flags. - */ - hci_req_init(&req, hdev); - update_adv_data(&req); - hci_req_run(&req, NULL); - - new_settings(hdev, NULL); - } -} - -void mgmt_connectable(struct hci_dev *hdev, u8 connectable) -{ - bool changed; - - /* Nothing needed here if there's a pending command since that - * commands request completion callback takes care of everything - * necessary. 
- */ - if (mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) - return; - - /* Powering off may clear the scan mode - don't let that interfere */ - if (!connectable && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) - return; - - if (connectable) - changed = !test_and_set_bit(HCI_CONNECTABLE, &hdev->dev_flags); - else - changed = test_and_clear_bit(HCI_CONNECTABLE, &hdev->dev_flags); - - if (changed) - new_settings(hdev, NULL); -} - -void mgmt_advertising(struct hci_dev *hdev, u8 advertising) -{ - /* Powering off may stop advertising - don't let that interfere */ - if (!advertising && mgmt_pending_find(MGMT_OP_SET_POWERED, hdev)) - return; - - if (advertising) - set_bit(HCI_ADVERTISING, &hdev->dev_flags); - else - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); -} - -void mgmt_write_scan_failed(struct hci_dev *hdev, u8 scan, u8 status) -{ - u8 mgmt_err = mgmt_status(status); - - if (scan & SCAN_PAGE) - mgmt_pending_foreach(MGMT_OP_SET_CONNECTABLE, hdev, - cmd_status_rsp, &mgmt_err); - - if (scan & SCAN_INQUIRY) - mgmt_pending_foreach(MGMT_OP_SET_DISCOVERABLE, hdev, - cmd_status_rsp, &mgmt_err); -} - void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent) { @@ -5279,7 +6071,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) ev.key.ediv = key->ediv; ev.key.rand = key->rand; - if (key->type == HCI_SMP_LTK) + if (key->type == SMP_LTK) ev.key.master = 1; memcpy(ev.key.val, key->val, sizeof(key->val)); @@ -5347,6 +6139,27 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL); } +void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 bdaddr_type, u8 store_hint, u16 min_interval, + u16 max_interval, u16 latency, u16 timeout) +{ + struct mgmt_ev_new_conn_param ev; + + if (!hci_is_identity_address(bdaddr, bdaddr_type)) + return; + + memset(&ev, 0, sizeof(ev)); + bacpy(&ev.addr.bdaddr, bdaddr); + ev.addr.type = link_to_bdaddr(LE_LINK, 
bdaddr_type); + ev.store_hint = store_hint; + ev.min_interval = cpu_to_le16(min_interval); + ev.max_interval = cpu_to_le16(max_interval); + ev.latency = cpu_to_le16(latency); + ev.timeout = cpu_to_le16(timeout); + + mgmt_event(MGMT_EV_NEW_CONN_PARAM, hdev, &ev, sizeof(ev), NULL); +} + static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) { @@ -5420,25 +6233,35 @@ static void unpair_device_rsp(struct pending_cmd *cmd, void *data) mgmt_pending_remove(cmd); } +bool mgmt_powering_down(struct hci_dev *hdev) +{ + struct pending_cmd *cmd; + struct mgmt_mode *cp; + + cmd = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); + if (!cmd) + return false; + + cp = cmd->param; + if (!cp->val) + return true; + + return false; +} + void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected) { struct mgmt_ev_device_disconnected ev; - struct pending_cmd *power_off; struct sock *sk = NULL; - power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); - if (power_off) { - struct mgmt_mode *cp = power_off->param; - - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (!cp->val && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. 
+ */ + if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); } if (!mgmt_connected) @@ -5498,19 +6321,13 @@ void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { struct mgmt_ev_connect_failed ev; - struct pending_cmd *power_off; - power_off = mgmt_pending_find(MGMT_OP_SET_POWERED, hdev); - if (power_off) { - struct mgmt_mode *cp = power_off->param; - - /* The connection is still in hci_conn_hash so test for 1 - * instead of 0 to know if this is the last one. - */ - if (!cp->val && hci_conn_count(hdev) == 1) { - cancel_delayed_work(&hdev->power_off); - queue_work(hdev->req_workqueue, &hdev->power_off.work); - } + /* The connection is still in hci_conn_hash so test for 1 + * instead of 0 to know if this is the last one. + */ + if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) { + cancel_delayed_work(&hdev->power_off); + queue_work(hdev->req_workqueue, &hdev->power_off.work); } bacpy(&ev.addr.bdaddr, bdaddr); @@ -5668,16 +6485,23 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL); } -void mgmt_auth_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status) +void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status) { struct mgmt_ev_auth_failed ev; + struct pending_cmd *cmd; + u8 status = mgmt_status(hci_status); - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = link_to_bdaddr(link_type, addr_type); - ev.status = mgmt_status(status); + bacpy(&ev.addr.bdaddr, &conn->dst); + ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); + ev.status = status; + + cmd = find_pairing(conn); + + mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev), + cmd ? 
cmd->sk : NULL); - mgmt_event(MGMT_EV_AUTH_FAILED, hdev, &ev, sizeof(ev), NULL); + if (cmd) + pairing_complete(cmd, status); } void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) @@ -5765,10 +6589,14 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) hci_req_init(&req, hdev); - if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) + if (test_bit(HCI_SSP_ENABLED, &hdev->dev_flags)) { + if (test_bit(HCI_USE_DEBUG_KEYS, &hdev->dev_flags)) + hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE, + sizeof(enable), &enable); update_eir(&req); - else + } else { clear_eir(&req); + } hci_req_run(&req, NULL); } @@ -5912,17 +6740,23 @@ void mgmt_read_local_oob_data_complete(struct hci_dev *hdev, u8 *hash192, } void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 *dev_class, s8 rssi, u8 cfm_name, - u8 ssp, u8 *eir, u16 eir_len, u8 *scan_rsp, - u8 scan_rsp_len) + u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, + u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) { char buf[512]; struct mgmt_ev_device_found *ev = (void *) buf; - struct smp_irk *irk; size_t ev_size; - if (!hci_discovery_active(hdev)) - return; + /* Don't send events for a non-kernel initiated discovery. With + * LE one exception is if we have pend_le_reports > 0 in which + * case we're doing passive scanning and want these events. + */ + if (!hci_discovery_active(hdev)) { + if (link_type == ACL_LINK) + return; + if (link_type == LE_LINK && list_empty(&hdev->pend_le_reports)) + return; + } /* Make sure that the buffer is big enough. The 5 extra bytes * are for the potential CoD field. 
@@ -5932,20 +6766,10 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, memset(buf, 0, sizeof(buf)); - irk = hci_get_irk(hdev, bdaddr, addr_type); - if (irk) { - bacpy(&ev->addr.bdaddr, &irk->bdaddr); - ev->addr.type = link_to_bdaddr(link_type, irk->addr_type); - } else { - bacpy(&ev->addr.bdaddr, bdaddr); - ev->addr.type = link_to_bdaddr(link_type, addr_type); - } - + bacpy(&ev->addr.bdaddr, bdaddr); + ev->addr.type = link_to_bdaddr(link_type, addr_type); ev->rssi = rssi; - if (cfm_name) - ev->flags |= cpu_to_le32(MGMT_DEV_FOUND_CONFIRM_NAME); - if (!ssp) - ev->flags |= cpu_to_le32(MGMT_DEV_FOUND_LEGACY_PAIRING); + ev->flags = cpu_to_le32(flags); if (eir_len > 0) memcpy(ev->eir, eir, eir_len); @@ -6013,63 +6837,19 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering) mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL); } -int mgmt_device_blocked(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) -{ - struct pending_cmd *cmd; - struct mgmt_ev_device_blocked ev; - - cmd = mgmt_pending_find(MGMT_OP_BLOCK_DEVICE, hdev); - - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = type; - - return mgmt_event(MGMT_EV_DEVICE_BLOCKED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); -} - -int mgmt_device_unblocked(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) -{ - struct pending_cmd *cmd; - struct mgmt_ev_device_unblocked ev; - - cmd = mgmt_pending_find(MGMT_OP_UNBLOCK_DEVICE, hdev); - - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = type; - - return mgmt_event(MGMT_EV_DEVICE_UNBLOCKED, hdev, &ev, sizeof(ev), - cmd ? 
cmd->sk : NULL); -} - static void adv_enable_complete(struct hci_dev *hdev, u8 status) { BT_DBG("%s status %u", hdev->name, status); - - /* Clear the advertising mgmt setting if we failed to re-enable it */ - if (status) { - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); - new_settings(hdev, NULL); - } } void mgmt_reenable_advertising(struct hci_dev *hdev) { struct hci_request req; - if (hci_conn_num(hdev, LE_LINK) > 0) - return; - if (!test_bit(HCI_ADVERTISING, &hdev->dev_flags)) return; hci_req_init(&req, hdev); enable_advertising(&req); - - /* If this fails we have no option but to let user space know - * that we've disabled advertising. - */ - if (hci_req_run(&req, adv_enable_complete) < 0) { - clear_bit(HCI_ADVERTISING, &hdev->dev_flags); - new_settings(hdev, NULL); - } + hci_req_run(&req, adv_enable_complete); } diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 754b6fe4f742..af73bc3acb40 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -227,7 +227,8 @@ static int rfcomm_check_security(struct rfcomm_dlc *d) break; } - return hci_conn_security(conn->hcon, d->sec_level, auth_type); + return hci_conn_security(conn->hcon, d->sec_level, auth_type, + d->out); } static void rfcomm_session_timeout(unsigned long arg) @@ -1909,10 +1910,13 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s) /* Get data directly from socket receive queue without copying it. 
*/ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); - if (!skb_linearize(skb)) + if (!skb_linearize(skb)) { s = rfcomm_recv_frame(s, skb); - else + if (!s) + break; + } else { kfree_skb(skb); + } } if (s && (sk->sk_state == BT_CLOSED)) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index c603a5eb4720..8bbbb5ec468c 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -918,7 +918,8 @@ static int rfcomm_sock_shutdown(struct socket *sock, int how) sk->sk_shutdown = SHUTDOWN_MASK; __rfcomm_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } release_sock(sk); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c06dbd3938e8..7ee9e4ab00f8 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -40,13 +40,38 @@ static struct bt_sock_list sco_sk_list = { .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock) }; -static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent); -static void sco_chan_del(struct sock *sk, int err); +/* ---- SCO connections ---- */ +struct sco_conn { + struct hci_conn *hcon; + + spinlock_t lock; + struct sock *sk; + + unsigned int mtu; +}; + +#define sco_conn_lock(c) spin_lock(&c->lock); +#define sco_conn_unlock(c) spin_unlock(&c->lock); static void sco_sock_close(struct sock *sk); static void sco_sock_kill(struct sock *sk); +/* ----- SCO socket info ----- */ +#define sco_pi(sk) ((struct sco_pinfo *) sk) + +struct sco_pinfo { + struct bt_sock bt; + bdaddr_t src; + bdaddr_t dst; + __u32 flags; + __u16 setting; + struct sco_conn *conn; +}; + /* ---- SCO timers ---- */ +#define SCO_CONN_TIMEOUT (HZ * 40) +#define SCO_DISCONN_TIMEOUT (HZ * 2) + static void sco_sock_timeout(unsigned long arg) { struct sock *sk = (struct sock *) arg; @@ -102,13 +127,31 @@ static struct sco_conn 
*sco_conn_add(struct hci_conn *hcon) return conn; } -static struct sock *sco_chan_get(struct sco_conn *conn) +/* Delete channel. + * Must be called on the locked socket. */ +static void sco_chan_del(struct sock *sk, int err) { - struct sock *sk = NULL; - sco_conn_lock(conn); - sk = conn->sk; - sco_conn_unlock(conn); - return sk; + struct sco_conn *conn; + + conn = sco_pi(sk)->conn; + + BT_DBG("sk %p, conn %p, err %d", sk, conn, err); + + if (conn) { + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + + if (conn->hcon) + hci_conn_drop(conn->hcon); + } + + sk->sk_state = BT_CLOSED; + sk->sk_err = err; + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_ZAPPED); } static int sco_conn_del(struct hci_conn *hcon, int err) @@ -122,7 +165,10 @@ static int sco_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ - sk = sco_chan_get(conn); + sco_conn_lock(conn); + sk = conn->sk; + sco_conn_unlock(conn); + if (sk) { bh_lock_sock(sk); sco_sock_clear_timer(sk); @@ -136,6 +182,17 @@ static int sco_conn_del(struct hci_conn *hcon, int err) return 0; } +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +{ + BT_DBG("conn %p", conn); + + sco_pi(sk)->conn = conn; + conn->sk = sk; + + if (parent) + bt_accept_enqueue(parent, sk); +} + static int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) { @@ -240,7 +297,11 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) { - struct sock *sk = sco_chan_get(conn); + struct sock *sk; + + sco_conn_lock(conn); + sk = conn->sk; + sco_conn_unlock(conn); if (!sk) goto drop; @@ -909,7 +970,8 @@ static int sco_sock_shutdown(struct socket *sock, int how) sco_sock_clear_timer(sk); __sco_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + if (sock_flag(sk, SOCK_LINGER) && 
sk->sk_lingertime && + !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } @@ -929,7 +991,8 @@ static int sco_sock_release(struct socket *sock) sco_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) { + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); @@ -940,44 +1003,6 @@ static int sco_sock_release(struct socket *sock) return err; } -static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) -{ - BT_DBG("conn %p", conn); - - sco_pi(sk)->conn = conn; - conn->sk = sk; - - if (parent) - bt_accept_enqueue(parent, sk); -} - -/* Delete channel. - * Must be called on the locked socket. */ -static void sco_chan_del(struct sock *sk, int err) -{ - struct sco_conn *conn; - - conn = sco_pi(sk)->conn; - - BT_DBG("sk %p, conn %p, err %d", sk, conn, err); - - if (conn) { - sco_conn_lock(conn); - conn->sk = NULL; - sco_pi(sk)->conn = NULL; - sco_conn_unlock(conn); - - if (conn->hcon) - hci_conn_drop(conn->hcon); - } - - sk->sk_state = BT_CLOSED; - sk->sk_err = err; - sk->sk_state_change(sk); - - sock_set_flag(sk, SOCK_ZAPPED); -} - static void sco_conn_ready(struct sco_conn *conn) { struct sock *parent; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index e33a982161c1..f09b6b65cf6b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -31,18 +31,26 @@ #include "smp.h" +#define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd) + #define SMP_TIMEOUT msecs_to_jiffies(30000) #define AUTH_REQ_MASK 0x07 - -#define SMP_FLAG_TK_VALID 1 -#define SMP_FLAG_CFM_PENDING 2 -#define SMP_FLAG_MITM_AUTH 3 -#define SMP_FLAG_COMPLETE 4 -#define SMP_FLAG_INITIATOR 5 +#define KEY_DIST_MASK 0x07 + +enum { + SMP_FLAG_TK_VALID, + SMP_FLAG_CFM_PENDING, + SMP_FLAG_MITM_AUTH, + SMP_FLAG_COMPLETE, + SMP_FLAG_INITIATOR, +}; struct smp_chan { - struct l2cap_conn 
*conn; + struct l2cap_conn *conn; + struct delayed_work security_timer; + unsigned long allow_cmd; /* Bitmask of allowed commands */ + u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ u8 prnd[16]; /* SMP Pairing Random (local) */ @@ -60,20 +68,16 @@ struct smp_chan { struct smp_ltk *slave_ltk; struct smp_irk *remote_irk; unsigned long flags; + + struct crypto_blkcipher *tfm_aes; }; -static inline void swap128(const u8 src[16], u8 dst[16]) +static inline void swap_buf(const u8 *src, u8 *dst, size_t len) { - int i; - for (i = 0; i < 16; i++) - dst[15 - i] = src[i]; -} + size_t i; -static inline void swap56(const u8 src[7], u8 dst[7]) -{ - int i; - for (i = 0; i < 7; i++) - dst[6 - i] = src[i]; + for (i = 0; i < len; i++) + dst[len - 1 - i] = src[i]; } static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) @@ -92,7 +96,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) desc.flags = 0; /* The most significant octet of key corresponds to k[0] */ - swap128(k, tmp); + swap_buf(k, tmp, 16); err = crypto_blkcipher_setkey(tfm, tmp, 16); if (err) { @@ -101,7 +105,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) } /* Most significant octet of plaintextData corresponds to data[0] */ - swap128(r, data); + swap_buf(r, data, 16); sg_init_one(&sg, data, 16); @@ -110,7 +114,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r) BT_ERR("Encrypt data error %d", err); /* Most significant octet of encryptedData corresponds to data[0] */ - swap128(data, r); + swap_buf(data, r, 16); return err; } @@ -141,12 +145,18 @@ static int smp_ah(struct crypto_blkcipher *tfm, u8 irk[16], u8 r[3], u8 res[3]) return 0; } -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr) +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; u8 hash[3]; int err; + if (!chan || !chan->data) + 
return false; + + tfm = chan->data; + BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(tfm, irk, &bdaddr->b[3], hash); @@ -156,10 +166,17 @@ bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], return !memcmp(bdaddr->b, hash, 3); } -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa) +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], bdaddr_t *rpa) { + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm; int err; + if (!chan || !chan->data) + return -EOPNOTSUPP; + + tfm = chan->data; + get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ @@ -174,13 +191,16 @@ int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa) return 0; } -static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], - u8 preq[7], u8 pres[7], u8 _iat, bdaddr_t *ia, - u8 _rat, bdaddr_t *ra, u8 res[16]) +static int smp_c1(struct smp_chan *smp, u8 k[16], u8 r[16], u8 preq[7], + u8 pres[7], u8 _iat, bdaddr_t *ia, u8 _rat, bdaddr_t *ra, + u8 res[16]) { + struct hci_dev *hdev = smp->conn->hcon->hdev; u8 p1[16], p2[16]; int err; + BT_DBG("%s", hdev->name); + memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ @@ -198,7 +218,7 @@ static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], u128_xor((u128 *) res, (u128 *) r, (u128 *) p1); /* res = e(k, res) */ - err = smp_e(tfm, k, res); + err = smp_e(smp->tfm_aes, k, res); if (err) { BT_ERR("Encrypt data error"); return err; @@ -208,70 +228,64 @@ static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16], u128_xor((u128 *) res, (u128 *) res, (u128 *) p2); /* res = e(k, res) */ - err = smp_e(tfm, k, res); + err = smp_e(smp->tfm_aes, k, res); if (err) BT_ERR("Encrypt data error"); return err; } -static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16], u8 r1[16], - u8 r2[16], u8 _r[16]) +static int smp_s1(struct smp_chan *smp, u8 k[16], u8 r1[16], u8 r2[16], + u8 _r[16]) { + struct hci_dev *hdev = 
smp->conn->hcon->hdev; int err; + BT_DBG("%s", hdev->name); + /* Just least significant octets from r1 and r2 are considered */ memcpy(_r, r2, 8); memcpy(_r + 8, r1, 8); - err = smp_e(tfm, k, _r); + err = smp_e(smp->tfm_aes, k, _r); if (err) BT_ERR("Encrypt data error"); return err; } -static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code, - u16 dlen, void *data) +static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { - struct sk_buff *skb; - struct l2cap_hdr *lh; - int len; - - len = L2CAP_HDR_SIZE + sizeof(code) + dlen; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp; + struct kvec iv[2]; + struct msghdr msg; - if (len > conn->mtu) - return NULL; + if (!chan) + return; - skb = bt_skb_alloc(len, GFP_ATOMIC); - if (!skb) - return NULL; + BT_DBG("code 0x%2.2x", code); - lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); - lh->len = cpu_to_le16(sizeof(code) + dlen); - lh->cid = cpu_to_le16(L2CAP_CID_SMP); + iv[0].iov_base = &code; + iv[0].iov_len = 1; - memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code)); + iv[1].iov_base = data; + iv[1].iov_len = len; - memcpy(skb_put(skb, dlen), data, dlen); + memset(&msg, 0, sizeof(msg)); - return skb; -} + msg.msg_iov = (struct iovec *) &iv; + msg.msg_iovlen = 2; -static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) -{ - struct sk_buff *skb = smp_build_cmd(conn, code, len, data); - - BT_DBG("code 0x%2.2x", code); + l2cap_chan_send(chan, &msg, 1 + len); - if (!skb) + if (!chan->data) return; - skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, 0); + smp = chan->data; - cancel_delayed_work_sync(&conn->security_timer); - schedule_delayed_work(&conn->security_timer, SMP_TIMEOUT); + cancel_delayed_work_sync(&smp->security_timer); + schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static __u8 authreq_to_seclevel(__u8 authreq) @@ -298,12 +312,13 @@ static void build_pairing_cmd(struct l2cap_conn *conn, struct 
smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0; - if (test_bit(HCI_PAIRABLE, &conn->hcon->hdev->dev_flags)) { + if (test_bit(HCI_BONDABLE, &conn->hcon->hdev->dev_flags)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; @@ -341,7 +356,8 @@ static void build_pairing_cmd(struct l2cap_conn *conn, static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) || (max_key_size < SMP_MIN_ENC_KEY_SIZE)) @@ -352,21 +368,60 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) return 0; } +static void smp_chan_destroy(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + bool complete; + + BUG_ON(!smp); + + cancel_delayed_work_sync(&smp->security_timer); + + complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); + mgmt_smp_complete(conn->hcon, complete); + + kfree(smp->csrk); + kfree(smp->slave_csrk); + + crypto_free_blkcipher(smp->tfm_aes); + + /* If pairing failed clean up any keys we might have */ + if (!complete) { + if (smp->ltk) { + list_del(&smp->ltk->list); + kfree(smp->ltk); + } + + if (smp->slave_ltk) { + list_del(&smp->slave_ltk->list); + kfree(smp->slave_ltk); + } + + if (smp->remote_irk) { + list_del(&smp->remote_irk->list); + kfree(smp->remote_irk); + } + } + + chan->data = NULL; + kfree(smp); + hci_conn_drop(conn->hcon); +} + static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; + struct l2cap_chan *chan = conn->smp; if (reason) smp_send_cmd(conn, 
SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags); - mgmt_auth_failed(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, - HCI_ERROR_AUTH_FAILURE); - - cancel_delayed_work_sync(&conn->security_timer); + mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) + if (chan->data) smp_chan_destroy(conn); } @@ -387,10 +442,12 @@ static const u8 gen_method[5][5] = { static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io) { - /* If either side has unknown io_caps, use JUST WORKS */ + /* If either side has unknown io_caps, use JUST_CFM (which gets + * converted later to JUST_WORKS if we're initiators. + */ if (local_io > SMP_IO_KEYBOARD_DISPLAY || remote_io > SMP_IO_KEYBOARD_DISPLAY) - return JUST_WORKS; + return JUST_CFM; return gen_method[remote_io][local_io]; } @@ -399,7 +456,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { struct hci_conn *hcon = conn->hcon; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; u8 method; u32 passkey = 0; int ret = 0; @@ -410,21 +468,25 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io); - /* If neither side wants MITM, use JUST WORKS */ - /* Otherwise, look up method from the table */ + /* If neither side wants MITM, either "just" confirm an incoming + * request or use just-works for outgoing ones. The JUST_CFM + * will be converted to JUST_WORKS if necessary later in this + * function. If either side has MITM look up the method from the + * table. 
+ */ if (!(auth & SMP_AUTH_MITM)) - method = JUST_WORKS; + method = JUST_CFM; else method = get_auth_method(smp, local_io, remote_io); - /* If not bonding, don't ask user to confirm a Zero TK */ - if (!(auth & SMP_AUTH_BONDING) && method == JUST_CFM) - method = JUST_WORKS; - /* Don't confirm locally initiated pairing attempts */ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) method = JUST_WORKS; + /* Don't bother user space with no IO capabilities */ + if (method == JUST_CFM && hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) + method = JUST_WORKS; + /* If Just Works, Continue with Zero TK */ if (method == JUST_WORKS) { set_bit(SMP_FLAG_TK_VALID, &smp->flags); @@ -432,14 +494,17 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, } /* Not Just Works/Confirm results in MITM Authentication */ - if (method != JUST_CFM) + if (method != JUST_CFM) { set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); + if (hcon->pending_sec_level < BT_SECURITY_HIGH) + hcon->pending_sec_level = BT_SECURITY_HIGH; + } /* If both devices have Keyoard-Display I/O, the master * Confirms and the slave Enters the passkey. 
*/ if (method == OVERLAP) { - if (hcon->link_mode & HCI_LM_MASTER) + if (hcon->role == HCI_ROLE_MASTER) method = CFM_PASSKEY; else method = REQ_PASSKEY; @@ -477,23 +542,15 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, static u8 smp_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; - struct hci_dev *hdev = conn->hcon->hdev; - struct crypto_blkcipher *tfm = hdev->tfm_aes; struct smp_cmd_pairing_confirm cp; int ret; BT_DBG("conn %p", conn); - /* Prevent mutual access to hdev->tfm_aes */ - hci_dev_lock(hdev); - - ret = smp_c1(tfm, smp->tk, smp->prnd, smp->preq, smp->prsp, + ret = smp_c1(smp, smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, conn->hcon->resp_addr_type, &conn->hcon->resp_addr, cp.confirm_val); - - hci_dev_unlock(hdev); - if (ret) return SMP_UNSPECIFIED; @@ -501,6 +558,11 @@ static u8 smp_confirm(struct smp_chan *smp) smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); + if (conn->hcon->out) + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); + else + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); + return 0; } @@ -508,25 +570,17 @@ static u8 smp_random(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; - struct crypto_blkcipher *tfm = hdev->tfm_aes; u8 confirm[16]; int ret; - if (IS_ERR_OR_NULL(tfm)) + if (IS_ERR_OR_NULL(smp->tfm_aes)) return SMP_UNSPECIFIED; BT_DBG("conn %p %s", conn, conn->hcon->out ? 
"master" : "slave"); - /* Prevent mutual access to hdev->tfm_aes */ - hci_dev_lock(hdev); - - ret = smp_c1(tfm, smp->tk, smp->rrnd, smp->preq, smp->prsp, + ret = smp_c1(smp, smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, hcon->resp_addr_type, &hcon->resp_addr, confirm); - - hci_dev_unlock(hdev); - if (ret) return SMP_UNSPECIFIED; @@ -540,7 +594,7 @@ static u8 smp_random(struct smp_chan *smp) __le64 rand = 0; __le16 ediv = 0; - smp_s1(tfm, smp->tk, smp->rrnd, smp->prnd, stk); + smp_s1(smp, smp->tk, smp->rrnd, smp->prnd, stk); memset(stk + smp->enc_key_size, 0, SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); @@ -550,6 +604,7 @@ static u8 smp_random(struct smp_chan *smp) hci_le_start_enc(hcon, ediv, rand, stk); hcon->enc_key_size = smp->enc_key_size; + set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { u8 stk[16], auth; __le64 rand = 0; @@ -558,7 +613,7 @@ static u8 smp_random(struct smp_chan *smp) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); - smp_s1(tfm, smp->tk, smp->prnd, smp->rrnd, stk); + smp_s1(smp, smp->tk, smp->prnd, smp->rrnd, stk); memset(stk + smp->enc_key_size, 0, SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size); @@ -568,80 +623,273 @@ static u8 smp_random(struct smp_chan *smp) else auth = 0; + /* Even though there's no _SLAVE suffix this is the + * slave STK we're adding for later lookup (the master + * STK never needs to be stored). 
+ */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - HCI_SMP_STK_SLAVE, auth, stk, smp->enc_key_size, - ediv, rand); + SMP_STK, auth, stk, smp->enc_key_size, ediv, rand); } return 0; } -static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) +static void smp_notify_keys(struct l2cap_conn *conn) { - struct smp_chan *smp; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + struct smp_cmd_pairing *req = (void *) &smp->preq[1]; + struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; + bool persistent; - smp = kzalloc(sizeof(*smp), GFP_ATOMIC); - if (!smp) - return NULL; + if (smp->remote_irk) { + mgmt_new_irk(hdev, smp->remote_irk); + /* Now that user space can be considered to know the + * identity address track the connection based on it + * from now on. + */ + bacpy(&hcon->dst, &smp->remote_irk->bdaddr); + hcon->dst_type = smp->remote_irk->addr_type; + queue_work(hdev->workqueue, &conn->id_addr_update_work); - smp->conn = conn; - conn->smp_chan = smp; - conn->hcon->smp_conn = conn; + /* When receiving an indentity resolving key for + * a remote device that does not use a resolvable + * private address, just remove the key so that + * it is possible to use the controller white + * list for scanning. + * + * Userspace will have been told to not store + * this key at this point. So it is safe to + * just remove it. + */ + if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) { + list_del(&smp->remote_irk->list); + kfree(smp->remote_irk); + smp->remote_irk = NULL; + } + } - hci_conn_hold(conn->hcon); + /* The LTKs and CSRKs should be persistent only if both sides + * had the bonding bit set in their authentication requests. 
+ */ + persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); - return smp; + if (smp->csrk) { + smp->csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->csrk, persistent); + } + + if (smp->slave_csrk) { + smp->slave_csrk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); + mgmt_new_csrk(hdev, smp->slave_csrk, persistent); + } + + if (smp->ltk) { + smp->ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->ltk, persistent); + } + + if (smp->slave_ltk) { + smp->slave_ltk->bdaddr_type = hcon->dst_type; + bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); + mgmt_new_ltk(hdev, smp->slave_ltk, persistent); + } } -void smp_chan_destroy(struct l2cap_conn *conn) +static void smp_allow_key_dist(struct smp_chan *smp) { - struct smp_chan *smp = conn->smp_chan; - bool complete; + /* Allow the first expected phase 3 PDU. The rest of the PDUs + * will be allowed in each PDU handler to ensure we receive + * them in the correct order. 
+ */ + if (smp->remote_key_dist & SMP_DIST_ENC_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); + else if (smp->remote_key_dist & SMP_DIST_ID_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + else if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); +} - BUG_ON(!smp); +static void smp_distribute_keys(struct smp_chan *smp) +{ + struct smp_cmd_pairing *req, *rsp; + struct l2cap_conn *conn = smp->conn; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; + __u8 *keydist; - complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); - mgmt_smp_complete(conn->hcon, complete); + BT_DBG("conn %p", conn); - kfree(smp->csrk); - kfree(smp->slave_csrk); + rsp = (void *) &smp->prsp[1]; - /* If pairing failed clean up any keys we might have */ - if (!complete) { - if (smp->ltk) { - list_del(&smp->ltk->list); - kfree(smp->ltk); - } + /* The responder sends its keys first */ + if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) { + smp_allow_key_dist(smp); + return; + } - if (smp->slave_ltk) { - list_del(&smp->slave_ltk->list); - kfree(smp->slave_ltk); - } + req = (void *) &smp->preq[1]; - if (smp->remote_irk) { - list_del(&smp->remote_irk->list); - kfree(smp->remote_irk); + if (hcon->out) { + keydist = &rsp->init_key_dist; + *keydist &= req->init_key_dist; + } else { + keydist = &rsp->resp_key_dist; + *keydist &= req->resp_key_dist; + } + + BT_DBG("keydist 0x%x", *keydist); + + if (*keydist & SMP_DIST_ENC_KEY) { + struct smp_cmd_encrypt_info enc; + struct smp_cmd_master_ident ident; + struct smp_ltk *ltk; + u8 authenticated; + __le16 ediv; + __le64 rand; + + get_random_bytes(enc.ltk, sizeof(enc.ltk)); + get_random_bytes(&ediv, sizeof(ediv)); + get_random_bytes(&rand, sizeof(rand)); + + smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); + + authenticated = hcon->sec_level == BT_SECURITY_HIGH; + ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, + SMP_LTK_SLAVE, authenticated, enc.ltk, + smp->enc_key_size, ediv, rand); + 
smp->slave_ltk = ltk; + + ident.ediv = ediv; + ident.rand = rand; + + smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); + + *keydist &= ~SMP_DIST_ENC_KEY; + } + + if (*keydist & SMP_DIST_ID_KEY) { + struct smp_cmd_ident_addr_info addrinfo; + struct smp_cmd_ident_info idinfo; + + memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); + + smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); + + /* The hci_conn contains the local identity address + * after the connection has been established. + * + * This is true even when the connection has been + * established using a resolvable random address. + */ + bacpy(&addrinfo.bdaddr, &hcon->src); + addrinfo.addr_type = hcon->src_type; + + smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), + &addrinfo); + + *keydist &= ~SMP_DIST_ID_KEY; + } + + if (*keydist & SMP_DIST_SIGN) { + struct smp_cmd_sign_info sign; + struct smp_csrk *csrk; + + /* Generate a new random key */ + get_random_bytes(sign.csrk, sizeof(sign.csrk)); + + csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); + if (csrk) { + csrk->master = 0x00; + memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } + smp->slave_csrk = csrk; + + smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); + + *keydist &= ~SMP_DIST_SIGN; } - kfree(smp); - conn->smp_chan = NULL; - conn->hcon->smp_conn = NULL; - hci_conn_drop(conn->hcon); + /* If there are still keys to be received wait for them */ + if (smp->remote_key_dist & KEY_DIST_MASK) { + smp_allow_key_dist(smp); + return; + } + + set_bit(SMP_FLAG_COMPLETE, &smp->flags); + smp_notify_keys(conn); + + smp_chan_destroy(conn); +} + +static void smp_timeout(struct work_struct *work) +{ + struct smp_chan *smp = container_of(work, struct smp_chan, + security_timer.work); + struct l2cap_conn *conn = smp->conn; + + BT_DBG("conn %p", conn); + + hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM); +} + +static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) +{ + struct l2cap_chan *chan = conn->smp; + 
struct smp_chan *smp; + + smp = kzalloc(sizeof(*smp), GFP_ATOMIC); + if (!smp) + return NULL; + + smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(smp->tfm_aes)) { + BT_ERR("Unable to create ECB crypto context"); + kfree(smp); + return NULL; + } + + smp->conn = conn; + chan->data = smp; + + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL); + + INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); + + hci_conn_hold(conn->hcon); + + return smp; } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { - struct l2cap_conn *conn = hcon->smp_conn; + struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; struct smp_chan *smp; u32 value; + int err; BT_DBG(""); if (!conn) return -ENOTCONN; - smp = conn->smp_chan; + chan = conn->smp; + if (!chan) + return -ENOTCONN; + + l2cap_chan_lock(chan); + if (!chan->data) { + err = -ENOTCONN; + goto unlock; + } + + smp = chan->data; switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: @@ -656,12 +904,16 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return 0; + err = 0; + goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto unlock; } + err = 0; + /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); @@ -669,12 +921,16 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) smp_failure(conn, rsp); } - return 0; +unlock: + l2cap_chan_unlock(chan); + return err; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; + struct l2cap_chan *chan = conn->smp; + struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp; u8 key_size, auth, sec_level; int ret; @@ -684,25 
+940,33 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; - if (conn->hcon->link_mode & HCI_LM_MASTER) + if (conn->hcon->role != HCI_ROLE_SLAVE) return SMP_CMD_NOTSUPP; - if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) + if (!chan->data) smp = smp_chan_create(conn); else - smp = conn->smp_chan; + smp = chan->data; if (!smp) return SMP_UNSPECIFIED; + /* We didn't start the pairing, so match remote */ + auth = req->auth_req & AUTH_REQ_MASK; + + if (!test_bit(HCI_BONDABLE, &hdev->dev_flags) && + (auth & SMP_AUTH_BONDING)) + return SMP_PAIRING_NOTSUPP; + smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); - /* We didn't start the pairing, so match remote */ - auth = req->auth_req; + if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) + sec_level = BT_SECURITY_MEDIUM; + else + sec_level = authreq_to_seclevel(auth); - sec_level = authreq_to_seclevel(auth); if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; @@ -728,22 +992,22 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); /* Request setup of TK */ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability); if (ret) return SMP_UNSPECIFIED; - clear_bit(SMP_FLAG_INITIATOR, &smp->flags); - return 0; } static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; - u8 key_size, auth = SMP_AUTH_NONE; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; + u8 key_size, auth; int ret; BT_DBG("conn %p", conn); @@ -751,7 +1015,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if 
(skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; - if (!(conn->hcon->link_mode & HCI_LM_MASTER)) + if (conn->hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); @@ -762,6 +1026,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; + auth = rsp->auth_req & AUTH_REQ_MASK; + /* If we need MITM check that it can be acheived */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; @@ -782,11 +1048,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) */ smp->remote_key_dist &= rsp->resp_key_dist; - if ((req->auth_req & SMP_AUTH_BONDING) && - (rsp->auth_req & SMP_AUTH_BONDING)) - auth = SMP_AUTH_BONDING; - - auth |= (req->auth_req | rsp->auth_req) & SMP_AUTH_MITM; + auth |= req->auth_req; ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) @@ -803,7 +1065,8 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p %s", conn, conn->hcon->out ? 
"master" : "slave"); @@ -813,10 +1076,14 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); - if (conn->hcon->out) + if (conn->hcon->out) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); - else if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); + return 0; + } + + if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); else set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); @@ -826,7 +1093,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); @@ -839,26 +1107,51 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) return smp_random(smp); } -static u8 smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) +static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) { struct smp_ltk *key; struct hci_conn *hcon = conn->hcon; key = hci_find_ltk_by_addr(hcon->hdev, &hcon->dst, hcon->dst_type, - hcon->out); + hcon->role); if (!key) - return 0; + return false; - if (sec_level > BT_SECURITY_MEDIUM && !key->authenticated) - return 0; + if (smp_ltk_sec_level(key) < sec_level) + return false; if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) - return 1; + return true; hci_le_start_enc(hcon, key->ediv, key->rand, key->val); hcon->enc_key_size = key->enc_size; - return 1; + /* We never store STKs for master role, so clear this flag */ + clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); + + return true; +} + +bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level) +{ + if (sec_level == BT_SECURITY_LOW) + return true; + + /* If we're encrypted with an STK always claim insufficient + * 
security. This way we allow the connection to be re-encrypted + * with an LTK, even if the LTK provides the same level of + * security. Only exception is if we don't have an LTK (e.g. + * because of key distribution bits). + */ + if (test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) && + hci_find_ltk_by_addr(hcon->hdev, &hcon->dst, hcon->dst_type, + hcon->role)) + return false; + + if (hcon->sec_level >= sec_level) + return true; + + return false; } static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) @@ -867,59 +1160,61 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; - u8 sec_level; + u8 sec_level, auth; BT_DBG("conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - if (!(conn->hcon->link_mode & HCI_LM_MASTER)) + if (hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; - sec_level = authreq_to_seclevel(rp->auth_req); + auth = rp->auth_req & AUTH_REQ_MASK; + + if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) + sec_level = BT_SECURITY_MEDIUM; + else + sec_level = authreq_to_seclevel(auth); + + if (smp_sufficient_security(hcon, sec_level)) + return 0; + if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; - smp = smp_chan_create(conn); + if (!smp) + return SMP_UNSPECIFIED; + + if (!test_bit(HCI_BONDABLE, &hcon->hdev->dev_flags) && + (auth & SMP_AUTH_BONDING)) + return SMP_PAIRING_NOTSUPP; skb_pull(skb, sizeof(*rp)); memset(&cp, 0, sizeof(cp)); - build_pairing_cmd(conn, &cp, NULL, rp->auth_req); + build_pairing_cmd(conn, &cp, NULL, auth); smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); - - clear_bit(SMP_FLAG_INITIATOR, &smp->flags); + SMP_ALLOW_CMD(smp, 
SMP_CMD_PAIRING_RSP); return 0; } -bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level) -{ - if (sec_level == BT_SECURITY_LOW) - return true; - - if (hcon->sec_level >= sec_level) - return true; - - return false; -} - int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_chan *chan; struct smp_chan *smp; __u8 authreq; + int ret; BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); @@ -927,6 +1222,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; + chan = conn->smp; + if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) return 1; @@ -936,16 +1233,23 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; - if (hcon->link_mode & HCI_LM_MASTER) + if (hcon->role == HCI_ROLE_MASTER) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; - if (test_and_set_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; + l2cap_chan_lock(chan); + + /* If SMP is already in progress ignore this request */ + if (chan->data) { + ret = 0; + goto unlock; + } smp = smp_chan_create(conn); - if (!smp) - return 1; + if (!smp) { + ret = 1; + goto unlock; + } authreq = seclevel_to_authreq(sec_level); @@ -956,7 +1260,7 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) hcon->pending_sec_level > BT_SECURITY_MEDIUM) authreq |= SMP_AUTH_MITM; - if (hcon->link_mode & HCI_LM_MASTER) { + if (hcon->role == HCI_ROLE_MASTER) { struct smp_cmd_pairing cp; build_pairing_cmd(conn, &cp, NULL, authreq); @@ -964,30 +1268,34 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); } else { struct smp_cmd_security_req cp; cp.auth_req = authreq; smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, 
SMP_CMD_PAIRING_REQ); } set_bit(SMP_FLAG_INITIATOR, &smp->flags); + ret = 0; - return 0; +unlock: + l2cap_chan_unlock(chan); + return ret; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG("conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) - return 0; + SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT); skb_pull(skb, sizeof(*rp)); @@ -999,7 +1307,8 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_master_ident *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; @@ -1010,23 +1319,24 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ENC_KEY)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; + if (smp->remote_key_dist & SMP_DIST_ID_KEY) + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); + else if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); + skb_pull(skb, sizeof(*rp)); hci_dev_lock(hdev); authenticated = (hcon->sec_level == BT_SECURITY_HIGH); - ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, HCI_SMP_LTK, + ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK, authenticated, smp->tk, smp->enc_key_size, rp->ediv, rp->rand); smp->ltk = ltk; - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - 
smp_distribute_keys(conn); + if (!(smp->remote_key_dist & KEY_DIST_MASK)) + smp_distribute_keys(smp); hci_dev_unlock(hdev); return 0; @@ -1035,16 +1345,15 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; BT_DBG(""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - return 0; + SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); @@ -1057,7 +1366,8 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; @@ -1066,15 +1376,16 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_ID_KEY)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; + if (smp->remote_key_dist & SMP_DIST_SIGN) + SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); + skb_pull(skb, sizeof(*info)); + hci_dev_lock(hcon->hdev); + /* Strictly speaking the Core Specification (4.1) allows sending * an empty address which would force us to rely on just the IRK * as "identity information". 
However, since such @@ -1084,8 +1395,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, */ if (!bacmp(&info->bdaddr, BDADDR_ANY)) { BT_ERR("Ignoring IRK with no identity address"); - smp_distribute_keys(conn); - return 0; + goto distribute; } bacpy(&smp->id_addr, &info->bdaddr); @@ -1099,7 +1409,11 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr, smp->id_addr_type, smp->irk, &rpa); - smp_distribute_keys(conn); +distribute: + if (!(smp->remote_key_dist & KEY_DIST_MASK)) + smp_distribute_keys(smp); + + hci_dev_unlock(hcon->hdev); return 0; } @@ -1107,7 +1421,8 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; - struct smp_chan *smp = conn->smp_chan; + struct l2cap_chan *chan = conn->smp; + struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct smp_csrk *csrk; @@ -1116,10 +1431,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; - /* Ignore this PDU if it wasn't requested */ - if (!(smp->remote_key_dist & SMP_DIST_SIGN)) - return 0; - /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; @@ -1132,16 +1443,17 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; - if (!(smp->remote_key_dist & SMP_DIST_SIGN)) - smp_distribute_keys(conn); + smp_distribute_keys(smp); hci_dev_unlock(hdev); return 0; } -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) +static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { + struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; + struct smp_chan *smp; __u8 code, reason; int err = 0; @@ -1150,13 +1462,10 @@ int smp_sig_channel(struct 
l2cap_conn *conn, struct sk_buff *skb) return 0; } - if (skb->len < 1) { - kfree_skb(skb); + if (skb->len < 1) return -EILSEQ; - } if (!test_bit(HCI_LE_ENABLED, &hcon->hdev->dev_flags)) { - err = -ENOTSUPP; reason = SMP_PAIRING_NOTSUPP; goto done; } @@ -1164,18 +1473,19 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) code = skb->data[0]; skb_pull(skb, sizeof(code)); - /* - * The SMP context must be initialized for all other PDUs except - * pairing and security requests. If we get any other PDU when - * not initialized simply disconnect (done if this function - * returns an error). + smp = chan->data; + + if (code > SMP_CMD_MAX) + goto drop; + + if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) + goto drop; + + /* If we don't have a context the only allowed commands are + * pairing request and security request. */ - if (code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ && - !conn->smp_chan) { - BT_ERR("Unexpected SMP command 0x%02x. Disconnecting.", code); - kfree_skb(skb); - return -ENOTSUPP; - } + if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ) + goto drop; switch (code) { case SMP_CMD_PAIRING_REQ: @@ -1184,7 +1494,6 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) case SMP_CMD_PAIRING_FAIL: smp_failure(conn, 0); - reason = 0; err = -EPERM; break; @@ -1226,181 +1535,217 @@ int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) default: BT_DBG("Unknown command code 0x%2.2x", code); - reason = SMP_CMD_NOTSUPP; - err = -EOPNOTSUPP; goto done; } done: - if (reason) - smp_failure(conn, reason); + if (!err) { + if (reason) + smp_failure(conn, reason); + kfree_skb(skb); + } - kfree_skb(skb); return err; + +drop: + BT_ERR("%s unexpected SMP command 0x%02x from %pMR", hcon->hdev->name, + code, &hcon->dst); + kfree_skb(skb); + return 0; } -static void smp_notify_keys(struct l2cap_conn *conn) +static void smp_teardown_cb(struct l2cap_chan *chan, int err) { - struct smp_chan *smp = 
conn->smp_chan; - struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; - struct smp_cmd_pairing *req = (void *) &smp->preq[1]; - struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; - bool persistent; + struct l2cap_conn *conn = chan->conn; - if (smp->remote_irk) { - mgmt_new_irk(hdev, smp->remote_irk); - /* Now that user space can be considered to know the - * identity address track the connection based on it - * from now on. - */ - bacpy(&hcon->dst, &smp->remote_irk->bdaddr); - hcon->dst_type = smp->remote_irk->addr_type; - l2cap_conn_update_id_addr(hcon); - } + BT_DBG("chan %p", chan); - /* The LTKs and CSRKs should be persistent only if both sides - * had the bonding bit set in their authentication requests. - */ - persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); + if (chan->data) + smp_chan_destroy(conn); - if (smp->csrk) { - smp->csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->csrk, persistent); - } + conn->smp = NULL; + l2cap_chan_put(chan); +} - if (smp->slave_csrk) { - smp->slave_csrk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_csrk->bdaddr, &hcon->dst); - mgmt_new_csrk(hdev, smp->slave_csrk, persistent); - } +static void smp_resume_cb(struct l2cap_chan *chan) +{ + struct smp_chan *smp = chan->data; + struct l2cap_conn *conn = chan->conn; + struct hci_conn *hcon = conn->hcon; - if (smp->ltk) { - smp->ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->ltk, persistent); - } + BT_DBG("chan %p", chan); - if (smp->slave_ltk) { - smp->slave_ltk->bdaddr_type = hcon->dst_type; - bacpy(&smp->slave_ltk->bdaddr, &hcon->dst); - mgmt_new_ltk(hdev, smp->slave_ltk, persistent); - } + if (!smp) + return; + + if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) + return; + + cancel_delayed_work(&smp->security_timer); + + smp_distribute_keys(smp); } -int smp_distribute_keys(struct l2cap_conn *conn) +static void 
smp_ready_cb(struct l2cap_chan *chan) { - struct smp_cmd_pairing *req, *rsp; - struct smp_chan *smp = conn->smp_chan; - struct hci_conn *hcon = conn->hcon; - struct hci_dev *hdev = hcon->hdev; - __u8 *keydist; + struct l2cap_conn *conn = chan->conn; - BT_DBG("conn %p", conn); + BT_DBG("chan %p", chan); - if (!test_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags)) - return 0; + conn->smp = chan; + l2cap_chan_hold(chan); +} - rsp = (void *) &smp->prsp[1]; +static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) +{ + int err; - /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & 0x07)) - return 0; + BT_DBG("chan %p", chan); - req = (void *) &smp->preq[1]; + err = smp_sig_channel(chan, skb); + if (err) { + struct smp_chan *smp = chan->data; - if (hcon->out) { - keydist = &rsp->init_key_dist; - *keydist &= req->init_key_dist; - } else { - keydist = &rsp->resp_key_dist; - *keydist &= req->resp_key_dist; + if (smp) + cancel_delayed_work_sync(&smp->security_timer); + + hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE); } - BT_DBG("keydist 0x%x", *keydist); + return err; +} - if (*keydist & SMP_DIST_ENC_KEY) { - struct smp_cmd_encrypt_info enc; - struct smp_cmd_master_ident ident; - struct smp_ltk *ltk; - u8 authenticated; - __le16 ediv; - __le64 rand; +static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, + unsigned long hdr_len, + unsigned long len, int nb) +{ + struct sk_buff *skb; - get_random_bytes(enc.ltk, sizeof(enc.ltk)); - get_random_bytes(&ediv, sizeof(ediv)); - get_random_bytes(&rand, sizeof(rand)); + skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); - smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); + skb->priority = HCI_PRIO_MAX; + bt_cb(skb)->chan = chan; - authenticated = hcon->sec_level == BT_SECURITY_HIGH; - ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, - HCI_SMP_LTK_SLAVE, authenticated, enc.ltk, - smp->enc_key_size, ediv, rand); - smp->slave_ltk = 
ltk; + return skb; +} - ident.ediv = ediv; - ident.rand = rand; +static const struct l2cap_ops smp_chan_ops = { + .name = "Security Manager", + .ready = smp_ready_cb, + .recv = smp_recv_cb, + .alloc_skb = smp_alloc_skb_cb, + .teardown = smp_teardown_cb, + .resume = smp_resume_cb, + + .new_connection = l2cap_chan_no_new_connection, + .state_change = l2cap_chan_no_state_change, + .close = l2cap_chan_no_close, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; - smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident); +static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) +{ + struct l2cap_chan *chan; - *keydist &= ~SMP_DIST_ENC_KEY; - } + BT_DBG("pchan %p", pchan); - if (*keydist & SMP_DIST_ID_KEY) { - struct smp_cmd_ident_addr_info addrinfo; - struct smp_cmd_ident_info idinfo; + chan = l2cap_chan_create(); + if (!chan) + return NULL; - memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); + chan->chan_type = pchan->chan_type; + chan->ops = &smp_chan_ops; + chan->scid = pchan->scid; + chan->dcid = chan->scid; + chan->imtu = pchan->imtu; + chan->omtu = pchan->omtu; + chan->mode = pchan->mode; - smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); + BT_DBG("created chan %p", chan); - /* The hci_conn contains the local identity address - * after the connection has been established. - * - * This is true even when the connection has been - * established using a resolvable random address. 
- */ - bacpy(&addrinfo.bdaddr, &hcon->src); - addrinfo.addr_type = hcon->src_type; + return chan; +} - smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), - &addrinfo); +static const struct l2cap_ops smp_root_chan_ops = { + .name = "Security Manager Root", + .new_connection = smp_new_conn_cb, + + /* None of these are implemented for the root channel */ + .close = l2cap_chan_no_close, + .alloc_skb = l2cap_chan_no_alloc_skb, + .recv = l2cap_chan_no_recv, + .state_change = l2cap_chan_no_state_change, + .teardown = l2cap_chan_no_teardown, + .ready = l2cap_chan_no_ready, + .defer = l2cap_chan_no_defer, + .suspend = l2cap_chan_no_suspend, + .resume = l2cap_chan_no_resume, + .set_shutdown = l2cap_chan_no_set_shutdown, + .get_sndtimeo = l2cap_chan_no_get_sndtimeo, + .memcpy_fromiovec = l2cap_chan_no_memcpy_fromiovec, +}; - *keydist &= ~SMP_DIST_ID_KEY; - } +int smp_register(struct hci_dev *hdev) +{ + struct l2cap_chan *chan; + struct crypto_blkcipher *tfm_aes; - if (*keydist & SMP_DIST_SIGN) { - struct smp_cmd_sign_info sign; - struct smp_csrk *csrk; + BT_DBG("%s", hdev->name); - /* Generate a new random key */ - get_random_bytes(sign.csrk, sizeof(sign.csrk)); + tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_aes)) { + int err = PTR_ERR(tfm_aes); + BT_ERR("Unable to create crypto context"); + return err; + } - csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); - if (csrk) { - csrk->master = 0x00; - memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); - } - smp->slave_csrk = csrk; + chan = l2cap_chan_create(); + if (!chan) { + crypto_free_blkcipher(tfm_aes); + return -ENOMEM; + } - smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); + chan->data = tfm_aes; - *keydist &= ~SMP_DIST_SIGN; - } + l2cap_add_scid(chan, L2CAP_CID_SMP); - /* If there are still keys to be received wait for them */ - if ((smp->remote_key_dist & 0x07)) - return 0; + l2cap_chan_set_defaults(chan); - clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->flags); - 
cancel_delayed_work_sync(&conn->security_timer); - set_bit(SMP_FLAG_COMPLETE, &smp->flags); - smp_notify_keys(conn); + bacpy(&chan->src, &hdev->bdaddr); + chan->src_type = BDADDR_LE_PUBLIC; + chan->state = BT_LISTEN; + chan->mode = L2CAP_MODE_BASIC; + chan->imtu = L2CAP_DEFAULT_MTU; + chan->ops = &smp_root_chan_ops; - smp_chan_destroy(conn); + hdev->smp_data = chan; return 0; } + +void smp_unregister(struct hci_dev *hdev) +{ + struct l2cap_chan *chan = hdev->smp_data; + struct crypto_blkcipher *tfm_aes; + + if (!chan) + return; + + BT_DBG("%s chan %p", hdev->name, chan); + + tfm_aes = chan->data; + if (tfm_aes) { + chan->data = NULL; + crypto_free_blkcipher(tfm_aes); + } + + hdev->smp_data = NULL; + l2cap_chan_put(chan); +} diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 5a8dc36460a1..86a683a8b491 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -102,6 +102,8 @@ struct smp_cmd_security_req { __u8 auth_req; } __packed; +#define SMP_CMD_MAX 0x0b + #define SMP_PASSKEY_ENTRY_FAILED 0x01 #define SMP_OOB_NOT_AVAIL 0x02 #define SMP_AUTH_REQUIREMENTS 0x03 @@ -116,17 +118,30 @@ struct smp_cmd_security_req { #define SMP_MIN_ENC_KEY_SIZE 7 #define SMP_MAX_ENC_KEY_SIZE 16 +/* LTK types used in internal storage (struct smp_ltk) */ +enum { + SMP_STK, + SMP_LTK, + SMP_LTK_SLAVE, +}; + +static inline u8 smp_ltk_sec_level(struct smp_ltk *key) +{ + if (key->authenticated) + return BT_SECURITY_HIGH; + + return BT_SECURITY_MEDIUM; +} + /* SMP Commands */ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); -int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); -int smp_distribute_keys(struct l2cap_conn *conn); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); -void smp_chan_destroy(struct l2cap_conn *conn); +bool smp_irk_matches(struct hci_dev *hdev, u8 irk[16], bdaddr_t *bdaddr); +int smp_generate_rpa(struct hci_dev *hdev, u8 irk[16], 
bdaddr_t *rpa); -bool smp_irk_matches(struct crypto_blkcipher *tfm, u8 irk[16], - bdaddr_t *bdaddr); -int smp_generate_rpa(struct crypto_blkcipher *tfm, u8 irk[16], bdaddr_t *rpa); +int smp_register(struct hci_dev *hdev); +void smp_unregister(struct hci_dev *hdev); #endif /* __SMP_H */ diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 8590b942bffa..fd7ee03c59b3 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -10,7 +10,9 @@ bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o -bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o +bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o + +obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 1a755a1e5410..44425aff7cba 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -161,7 +161,7 @@ static int __init br_init(void) if (err) goto err_out1; - err = br_netfilter_init(); + err = br_nf_core_init(); if (err) goto err_out2; @@ -179,11 +179,16 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif + pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " + "deprecated. 
Update your scripts to load br_netfilter if you " + "need this.\n"); + return 0; + err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: - br_netfilter_fini(); + br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: @@ -196,20 +201,17 @@ err_out: static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); - br_netlink_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); - unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ - br_netfilter_fini(); + br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif - br_fdb_fini(); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 568cccd39a3d..ffd379db5938 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -36,7 +36,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) u16 vid = 0; rcu_read_lock(); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); rcu_read_unlock(); @@ -88,12 +88,17 @@ out: static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); + int err; br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!br->stats) return -ENOMEM; - return 0; + err = br_vlan_init(br); + if (err) + free_percpu(br->stats); + + return err; } static int br_dev_open(struct net_device *dev) @@ -167,7 +172,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif @@ -389,5 +394,4 @@ void br_dev_setup(struct net_device *dev) br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); - br_vlan_init(br); } diff --git 
a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b524c36c1273..6f6c95cfe8f2 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -93,7 +93,7 @@ static void fdb_rcu_free(struct rcu_head *head) static void fdb_add_hw(struct net_bridge *br, const unsigned char *addr) { int err; - struct net_bridge_port *p, *tmp; + struct net_bridge_port *p; ASSERT_RTNL(); @@ -107,11 +107,9 @@ static void fdb_add_hw(struct net_bridge *br, const unsigned char *addr) return; undo: - list_for_each_entry(tmp, &br->port_list, list) { - if (tmp == p) - break; - if (!br_promisc_port(tmp)) - dev_uc_del(tmp->dev, addr); + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + if (!br_promisc_port(p)) + dev_uc_del(p->dev, addr); } } @@ -631,7 +629,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - if (nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) + if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id)) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -678,6 +676,7 @@ errout: int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { struct net_bridge *br = netdev_priv(dev); @@ -693,6 +692,19 @@ int br_fdb_dump(struct sk_buff *skb, if (idx < cb->args[0]) goto skip; + if (filter_dev && + (!f->dst || f->dst->dev != filter_dev)) { + if (filter_dev != dev) + goto skip; + /* !f->dst is a speacial case for bridge + * It means the MAC belongs to the bridge + * Therefore need a little more filtering + * we only want to dump the !f->dst case + */ + if (f->dst) + goto skip; + } + if (fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 056b67b0e277..992ec49a96aa 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -49,6 +49,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) 
return 0; } +EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct sk_buff *skb) { @@ -56,6 +57,7 @@ int br_forward_finish(struct sk_buff *skb) br_dev_queue_push_xmit); } +EXPORT_SYMBOL_GPL(br_forward_finish); static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 3eca3fdf8fe1..ed307db7a12b 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -252,12 +252,12 @@ static void del_nbp(struct net_bridge_port *p) br_fdb_delete_by_port(br, p, 1); nbp_update_port_count(br); + netdev_upper_dev_unlink(dev, br->dev); + dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); - netdev_upper_dev_unlink(dev, br->dev); - br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); @@ -332,7 +332,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD; br_init_port(p); - p->state = BR_STATE_DISABLED; + br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); br_multicast_add_port(p); @@ -344,7 +344,7 @@ int br_add_bridge(struct net *net, const char *name) struct net_device *dev; int res; - dev = alloc_netdev(sizeof(struct net_bridge), name, + dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) @@ -476,16 +476,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err3; - err = netdev_master_upper_dev_link(dev, br->dev); + err = netdev_rx_handler_register(dev, br_handle_frame, p); if (err) goto err4; - err = netdev_rx_handler_register(dev, br_handle_frame, p); + dev->priv_flags |= IFF_BRIDGE_PORT; + + err = netdev_master_upper_dev_link(dev, br->dev); if (err) goto err5; - dev->priv_flags |= IFF_BRIDGE_PORT; - dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -500,6 +500,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed 
insert local address bridge forwarding table\n"); + if (nbp_vlan_init(p)) + netdev_err(dev, "failed to initialize vlan filtering on this port\n"); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -520,7 +523,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; err5: - netdev_upper_dev_unlink(dev, br->dev); + dev->priv_flags &= ~IFF_BRIDGE_PORT; + netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 366c43649079..6fd5522df696 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -140,6 +140,7 @@ drop: kfree_skb(skb); goto out; } +EXPORT_SYMBOL_GPL(br_handle_frame_finish); /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct sk_buff *skb) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index abfa0b65a111..648d79ccf462 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } @@ -1822,7 +1822,7 @@ static void br_multicast_query_expired(struct net_bridge *br, if (query->startup_sent < br->multicast_startup_query_count) query->startup_sent++; - rcu_assign_pointer(querier, NULL); + RCU_INIT_POINTER(querier, NULL); br_multicast_send_query(br, NULL, query); spin_unlock(&br->multicast_lock); } @@ -2216,6 +2216,43 @@ unlock: EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); /** + * br_multicast_has_querier_anywhere - Checks for a querier on a bridge + * @dev: The bridge port providing the bridge on which to check for a querier + * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 + * + * Checks whether the given interface has a bridge on top and if so returns + * true if a valid querier exists anywhere on 
the bridged link layer. + * Otherwise returns false. + */ +bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) +{ + struct net_bridge *br; + struct net_bridge_port *port; + struct ethhdr eth; + bool ret = false; + + rcu_read_lock(); + if (!br_port_exists(dev)) + goto unlock; + + port = br_port_get_rcu(dev); + if (!port || !port->br) + goto unlock; + + br = port->br; + + memset(&eth, 0, sizeof(eth)); + eth.h_proto = htons(proto); + + ret = br_multicast_querier_exists(br, &eth); + +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); + +/** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a615264cf01a..1bada53bb195 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -111,66 +111,6 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) pppoe_proto(skb) == htons(PPP_IPV6) && \ brnf_filter_pppoe_tagged) -static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) -{ -} - -static void fake_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - return NULL; -} - -static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return NULL; -} - -static unsigned int fake_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops fake_dst_ops = { - .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), - .update_pmtu = fake_update_pmtu, - .redirect = fake_redirect, - .cow_metrics = fake_cow_metrics, - .neigh_lookup = fake_neigh_lookup, - .mtu = fake_mtu, -}; - -/* - * Initialize bogus route table used to keep
netfilter happy. - * Currently, we fill in the PMTU entry because netfilter - * refragmentation needs it, and the rt_flags entry because - * ipt_REJECT needs it. Future netfilter modules might - * require us to fill additional fields. - */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - -void br_netfilter_rtable_init(struct net_bridge *br) -{ - struct rtable *rt = &br->fake_rtable; - - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.dev = br->dev; - rt->dst.path = &rt->dst; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); - rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; - rt->dst.ops = &fake_dst_ops; -} - static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) { struct net_bridge_port *port; @@ -245,14 +185,6 @@ static inline void nf_bridge_save_header(struct sk_buff *skb) skb->nf_bridge->data, header_size); } -static inline void nf_bridge_update_protocol(struct sk_buff *skb) -{ - if (skb->nf_bridge->mask & BRNF_8021Q) - skb->protocol = htons(ETH_P_8021Q); - else if (skb->nf_bridge->mask & BRNF_PPPoE) - skb->protocol = htons(ETH_P_PPP_SES); -} - /* When handing a packet over to the IP layer * check whether we have a skb that is in the * expected format @@ -320,26 +252,6 @@ drop: return -1; } -/* Fill in the header for fragmented IP packets handled by - * the IPv4 connection tracking code. - */ -int nf_bridge_copy_header(struct sk_buff *skb) -{ - int err; - unsigned int header_size; - - nf_bridge_update_protocol(skb); - header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow_head(skb, header_size); - if (err) - return err; - - skb_copy_to_linear_data_offset(skb, -header_size, - skb->nf_bridge->data, header_size); - __skb_push(skb, nf_bridge_encap_header_len(skb)); - return 0; -} - /* PF_BRIDGE/PRE_ROUTING *********************************************/ /* Undo the changes made for ip6tables PREROUTING and continue the * bridge PRE_ROUTING hook. 
*/ @@ -404,6 +316,7 @@ static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ nf_bridge->mask |= BRNF_BRIDGED_DNAT; + /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } neigh_release(neigh); @@ -459,6 +372,10 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct rtable *rt; int err; + int frag_max_size; + + frag_max_size = IPCB(skb)->frag_max_size; + BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; @@ -863,13 +780,19 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops, static int br_nf_dev_queue_xmit(struct sk_buff *skb) { int ret; + int frag_max_size; + /* This is wrong! We should preserve the original fragment + * boundaries by preserving frag_list rather than refragmenting. + */ if (skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && !skb_is_gso(skb)) { + frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; if (br_parse_ip_options(skb)) /* Drop invalid packet */ return NF_DROP; + IPCB(skb)->frag_max_size = frag_max_size; ret = ip_fragment(skb, br_dev_queue_push_xmit); } else ret = br_dev_queue_push_xmit(skb); @@ -944,6 +867,11 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, return NF_ACCEPT; } +void br_netfilter_enable(void) +{ +} +EXPORT_SYMBOL_GPL(br_netfilter_enable); + /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because * br_dev_queue_push_xmit is called afterwards */ static struct nf_hook_ops br_nf_ops[] __read_mostly = { @@ -1059,38 +987,42 @@ static struct ctl_table brnf_table[] = { }; #endif -int __init br_netfilter_init(void) +static int __init br_netfilter_init(void) { int ret; - ret = dst_entries_init(&fake_dst_ops); + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); if (ret < 0) return ret; - ret = 
nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - if (ret < 0) { - dst_entries_destroy(&fake_dst_ops); - return ret; - } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); - dst_entries_destroy(&fake_dst_ops); - return -ENOMEM; + ret = -ENOMEM; + goto err1; } #endif printk(KERN_NOTICE "Bridge firewalling registered\n"); return 0; +err1: + nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + return ret; } -void br_netfilter_fini(void) +static void __exit br_netfilter_fini(void) { nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif - dst_entries_destroy(&fake_dst_ops); } + +module_init(br_netfilter_init); +module_exit(br_netfilter_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge"); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 26edb518b839..2ff9706647f2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -208,7 +208,6 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, int err = 0; struct net_bridge_port *port = br_port_get_rtnl(dev); - /* not a bridge port and */ if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN)) goto out; @@ -258,9 +257,6 @@ static int br_afspec(struct net_bridge *br, } else err = br_vlan_add(br, vinfo->vid, vinfo->flags); - if (err) - break; - break; case RTM_DELLINK: @@ -277,7 +273,7 @@ static int br_afspec(struct net_bridge *br, return err; } -static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { +static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, [IFLA_BRPORT_COST] = { 
.type = NLA_U32 }, [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, @@ -305,7 +301,7 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state) (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED)) return -ENETDOWN; - p->state = state; + br_set_state(p, state); br_log_state(p); br_port_state_selection(p->br); return 0; @@ -383,7 +379,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { err = nla_parse_nested(tb, IFLA_BRPORT_MAX, - protinfo, ifla_brport_policy); + protinfo, br_port_policy); if (err) return err; @@ -462,6 +458,88 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, return register_netdevice(dev); } +static int br_port_slave_changelink(struct net_device *brdev, + struct net_device *dev, + struct nlattr *tb[], + struct nlattr *data[]) +{ + if (!data) + return 0; + return br_setport(br_port_get_rtnl(dev), data); +} + +static int br_port_fill_slave_info(struct sk_buff *skb, + const struct net_device *brdev, + const struct net_device *dev) +{ + return br_port_fill_attrs(skb, br_port_get_rtnl(dev)); +} + +static size_t br_port_get_slave_size(const struct net_device *brdev, + const struct net_device *dev) +{ + return br_port_info_size(); +} + +static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { + [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 }, + [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 }, + [IFLA_BR_MAX_AGE] = { .type = NLA_U32 }, +}; + +static int br_changelink(struct net_device *brdev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(brdev); + int err; + + if (!data) + return 0; + + if (data[IFLA_BR_FORWARD_DELAY]) { + err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY])); + if (err) + return err; + } + + if (data[IFLA_BR_HELLO_TIME]) { + err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME])); + if (err) + return err; + } + + if (data[IFLA_BR_MAX_AGE]) { + err = 
br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE])); + if (err) + return err; + } + + return 0; +} + +static size_t br_get_size(const struct net_device *brdev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + 0; +} + +static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) +{ + struct net_bridge *br = netdev_priv(brdev); + u32 forward_delay = jiffies_to_clock_t(br->forward_delay); + u32 hello_time = jiffies_to_clock_t(br->hello_time); + u32 age_time = jiffies_to_clock_t(br->max_age); + + if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || + nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + return -EMSGSIZE; + + return 0; +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -486,12 +564,23 @@ static struct rtnl_af_ops br_af_ops = { }; struct rtnl_link_ops br_link_ops __read_mostly = { - .kind = "bridge", - .priv_size = sizeof(struct net_bridge), - .setup = br_dev_setup, - .validate = br_validate, - .newlink = br_dev_newlink, - .dellink = br_dev_delete, + .kind = "bridge", + .priv_size = sizeof(struct net_bridge), + .setup = br_dev_setup, + .maxtype = IFLA_BRPORT_MAX, + .policy = br_policy, + .validate = br_validate, + .newlink = br_dev_newlink, + .changelink = br_changelink, + .dellink = br_dev_delete, + .get_size = br_get_size, + .fill_info = br_fill_info, + + .slave_maxtype = IFLA_BRPORT_MAX, + .slave_policy = br_port_policy, + .slave_changelink = br_port_slave_changelink, + .get_slave_size = br_port_get_slave_size, + .fill_slave_info = br_port_fill_slave_info, }; int __init br_netlink_init(void) @@ -513,7 +602,7 @@ out_af: return err; } -void __exit br_netlink_fini(void) +void br_netlink_fini(void) { br_mdb_uninit(); rtnl_af_unregister(&br_af_ops); diff --git a/net/bridge/br_nf_core.c 
b/net/bridge/br_nf_core.c new file mode 100644 index 000000000000..387cb3bd017c --- /dev/null +++ b/net/bridge/br_nf_core.c @@ -0,0 +1,96 @@ +/* + * Handle firewalling core + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek <buytenh@gnu.org> + * Bart De Schuymer <bdschuym@pandora.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/in_route.h> +#include <linux/inetdevice.h> +#include <net/route.h> + +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void fake_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +static unsigned int fake_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops fake_dst_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .update_pmtu = fake_update_pmtu, + .redirect = fake_redirect, + .cow_metrics = fake_cow_metrics, + .neigh_lookup = fake_neigh_lookup, + .mtu = fake_mtu, +}; + +/* + * Initialize bogus route table used to keep netfilter happy. + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. 
+ */ +static const u32 br_dst_default_metrics[RTAX_MAX] = { + [RTAX_MTU - 1] = 1500, +}; + +void br_netfilter_rtable_init(struct net_bridge *br) +{ + struct rtable *rt = &br->fake_rtable; + + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + rt->dst.path = &rt->dst; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; + rt->dst.ops = &fake_dst_ops; +} + +int __init br_nf_core_init(void) +{ + return dst_entries_init(&fake_dst_ops); +} + +void br_nf_core_fini(void) +{ + dst_entries_destroy(&fake_dst_ops); +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 23caf5b0309e..4d783d071305 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -221,7 +221,7 @@ struct net_bridge struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct rtable fake_rtable; bool nf_call_iptables; bool nf_call_ip6tables; @@ -299,16 +299,24 @@ struct net_bridge #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_enabled; __be16 vlan_proto; + u16 default_pvid; struct net_port_vlans __rcu *vlan_info; #endif }; struct br_input_skb_cb { struct net_device *brdev; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING int igmp; int mrouters_only; #endif + + u16 frag_max_size; + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + bool vlan_filtered; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -399,7 +407,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, int idx); + struct net_device *dev, struct net_device *fdev, int idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct 
net_bridge_port *p); @@ -601,11 +609,13 @@ bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); -void br_vlan_init(struct net_bridge *br); +int br_vlan_init(struct net_bridge *br); +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); bool nbp_vlan_find(struct net_bridge_port *port, u16 vid); +int nbp_vlan_init(struct net_bridge_port *port); static inline struct net_port_vlans *br_get_vlan_info( const struct net_bridge *br) @@ -638,11 +648,11 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) static inline u16 br_get_pvid(const struct net_port_vlans *v) { - /* Return just the VID if it is set, or VLAN_N_VID (invalid vid) if - * vid wasn't set - */ + if (!v) + return 0; + smp_rmb(); - return v->pvid ?: VLAN_N_VID; + return v->pvid; } static inline int br_vlan_enabled(struct net_bridge *br) @@ -701,8 +711,9 @@ static inline void br_recalculate_fwd_mask(struct net_bridge *br) { } -static inline void br_vlan_init(struct net_bridge *br) +static inline int br_vlan_init(struct net_bridge *br) { + return 0; } static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags) @@ -735,13 +746,18 @@ static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid) return false; } +static inline int nbp_vlan_init(struct net_bridge_port *port) +{ + return 0; +} + static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { return 0; } static inline u16 br_get_pvid(const struct net_port_vlans *v) { - return VLAN_N_VID; /* Returns invalid vid */ + return 0; } static inline int br_vlan_enabled(struct net_bridge *br) @@ -751,18 +767,19 @@ static inline 
int br_vlan_enabled(struct net_bridge *br) #endif /* br_netfilter.c */ -#ifdef CONFIG_BRIDGE_NETFILTER -int br_netfilter_init(void); -void br_netfilter_fini(void); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) +int br_nf_core_init(void); +void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else -#define br_netfilter_init() (0) -#define br_netfilter_fini() do { } while (0) +static inline int br_nf_core_init(void) { return 0; } +static inline void br_nf_core_fini(void) {} #define br_netfilter_rtable_init(x) #endif /* br_stp.c */ void br_log_state(const struct net_bridge_port *p); +void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); void br_become_designated_port(struct net_bridge_port *p); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 3c86f0538cbb..2b047bcf42a4 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,11 @@ void br_log_state(const struct net_bridge_port *p) br_port_state_names[p->state]); } +void br_set_state(struct net_bridge_port *p, unsigned int state) +{ + p->state = state; +} + /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { @@ -107,7 +112,7 @@ static void br_root_port_block(const struct net_bridge *br, br_notice(br, "port %u(%s) tried to become root port (blocked)", (unsigned int) p->port_no, p->dev->name); - p->state = BR_STATE_LISTENING; + br_set_state(p, BR_STATE_LISTENING); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -387,7 +392,7 @@ static void br_make_blocking(struct net_bridge_port *p) p->state == BR_STATE_LEARNING) br_topology_change_detection(p->br); - p->state = BR_STATE_BLOCKING; + br_set_state(p, BR_STATE_BLOCKING); br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); @@ -404,13 +409,13 @@ static void br_make_forwarding(struct net_bridge_port *p) return; if (br->stp_enabled == BR_NO_STP 
|| br->forward_delay == 0) { - p->state = BR_STATE_FORWARDING; + br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) - p->state = BR_STATE_LISTENING; + br_set_state(p, BR_STATE_LISTENING); else - p->state = BR_STATE_LEARNING; + br_set_state(p, BR_STATE_LEARNING); br_multicast_enable_port(p); br_log_state(p); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 189ba1e7d851..41146872c1b4 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -37,7 +37,7 @@ void br_init_port(struct net_bridge_port *p) { p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); - p->state = BR_STATE_BLOCKING; + br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; } @@ -100,7 +100,7 @@ void br_stp_disable_port(struct net_bridge_port *p) wasroot = br_is_root_bridge(br); br_become_designated_port(p); - p->state = BR_STATE_DISABLED; + br_set_state(p, BR_STATE_DISABLED); p->topology_change_ack = 0; p->config_pending = 0; diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 558c46d19e05..4fcaa67750fd 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -87,11 +87,11 @@ static void br_forward_delay_timer_expired(unsigned long arg) (unsigned int) p->port_no, p->dev->name); spin_lock(&br->lock); if (p->state == BR_STATE_LISTENING) { - p->state = BR_STATE_LEARNING; + br_set_state(p, BR_STATE_LEARNING); mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } else if (p->state == BR_STATE_LEARNING) { - p->state = BR_STATE_FORWARDING; + br_set_state(p, BR_STATE_FORWARDING); if (br_is_designated_for_some_port(br)) br_topology_change_detection(br); netif_carrier_on(br->dev); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c9e2572b15f4..4c97fc50fb70 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -629,7 +629,7 @@ 
static ssize_t multicast_startup_query_interval_store( } static DEVICE_ATTR_RW(multicast_startup_query_interval); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { @@ -725,6 +725,22 @@ static ssize_t vlan_protocol_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); + +static ssize_t default_pvid_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%d\n", br->default_pvid); +} + +static ssize_t default_pvid_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); +} +static DEVICE_ATTR_RW(default_pvid); #endif static struct attribute *bridge_attrs[] = { @@ -763,7 +779,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, @@ -771,6 +787,7 @@ static struct attribute *bridge_attrs[] = { #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, + &dev_attr_default_pvid.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 2b2774fe0703..150048fb99b0 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -27,9 +27,13 @@ static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags) { if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(v, vid); + else + __vlan_delete_pvid(v, vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) set_bit(vid, v->untagged_bitmap); + else + clear_bit(vid, v->untagged_bitmap); } static int __vlan_add(struct 
net_port_vlans *v, u16 vid, u16 flags) @@ -55,10 +59,8 @@ static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags) if (p) { /* Add VLAN to the device filter if it is supported. - * Stricly speaking, this is not necessary now, since - * devices are made promiscuous by the bridge, but if - * that ever changes this code will allow tagged - * traffic to enter the bridge. + * This ensures tagged traffic enters the bridge when + * promiscuous mode is disabled by br_manage_promisc(). */ err = vlan_vid_add(dev, br->vlan_proto, vid); if (err) @@ -127,7 +129,8 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, { u16 vid; - if (!br->vlan_enabled) + /* If this packet was not filtered at input, let it pass */ + if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* Vlan filter table must be configured at this point. The @@ -166,8 +169,10 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ - if (!br->vlan_enabled) + if (!br->vlan_enabled) { + BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; + } /* If there are no vlan in the permitted list, all packets are * rejected. @@ -175,6 +180,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, if (!v) goto drop; + BR_INPUT_SKB_CB(skb)->vlan_filtered = true; proto = br->vlan_proto; /* If vlan tx offload is disabled on bridge device and frame was @@ -183,7 +189,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, */ if (unlikely(!vlan_tx_tag_present(skb) && skb->protocol == proto)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } @@ -217,7 +223,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ - if (pvid == VLAN_N_VID) + if (!pvid) goto drop; /* PVID is set on this port. 
Any untagged or priority-tagged @@ -253,7 +259,8 @@ bool br_allowed_egress(struct net_bridge *br, { u16 vid; - if (!br->vlan_enabled) + /* If this packet was not filtered at input, let it pass */ + if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; if (!v) @@ -272,6 +279,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) struct net_bridge *br = p->br; struct net_port_vlans *v; + /* If filtering was disabled at input, let it pass. */ if (!br->vlan_enabled) return true; @@ -284,7 +292,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) if (!*vid) { *vid = br_get_pvid(v); - if (*vid == VLAN_N_VID) + if (!*vid) return false; return true; @@ -491,9 +499,141 @@ err_filt: goto unlock; } -void br_vlan_init(struct net_bridge *br) +static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) +{ + return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap); +} + +static void br_vlan_disable_default_pvid(struct net_bridge *br) +{ + struct net_bridge_port *p; + u16 pvid = br->default_pvid; + + /* Disable default_pvid on all ports where it is still + * configured. + */ + if (vlan_default_pvid(br_get_vlan_info(br), pvid)) + br_vlan_delete(br, pvid); + + list_for_each_entry(p, &br->port_list, list) { + if (vlan_default_pvid(nbp_get_vlan_info(p), pvid)) + nbp_vlan_delete(p, pvid); + } + + br->default_pvid = 0; +} + +static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid) +{ + struct net_bridge_port *p; + u16 old_pvid; + int err = 0; + unsigned long *changed; + + changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long), + GFP_KERNEL); + if (!changed) + return -ENOMEM; + + old_pvid = br->default_pvid; + + /* Update default_pvid config only if we do not conflict with + * user configuration. 
+ */ + if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) && + !br_vlan_find(br, pvid)) { + err = br_vlan_add(br, pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (err) + goto out; + br_vlan_delete(br, old_pvid); + set_bit(0, changed); + } + + list_for_each_entry(p, &br->port_list, list) { + /* Update default_pvid config only if we do not conflict with + * user configuration. + */ + if ((old_pvid && + !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) || + nbp_vlan_find(p, pvid)) + continue; + + err = nbp_vlan_add(p, pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + if (err) + goto err_port; + nbp_vlan_delete(p, old_pvid); + set_bit(p->port_no, changed); + } + + br->default_pvid = pvid; + +out: + kfree(changed); + return err; + +err_port: + list_for_each_entry_continue_reverse(p, &br->port_list, list) { + if (!test_bit(p->port_no, changed)) + continue; + + if (old_pvid) + nbp_vlan_add(p, old_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + nbp_vlan_delete(p, pvid); + } + + if (test_bit(0, changed)) { + if (old_pvid) + br_vlan_add(br, old_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED); + br_vlan_delete(br, pvid); + } + goto out; +} + +int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) +{ + u16 pvid = val; + int err = 0; + + if (val >= VLAN_VID_MASK) + return -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (pvid == br->default_pvid) + goto unlock; + + /* Only allow default pvid change when filtering is disabled */ + if (br->vlan_enabled) { + pr_info_once("Please disable vlan filtering to change default_pvid\n"); + err = -EPERM; + goto unlock; + } + + if (!pvid) + br_vlan_disable_default_pvid(br); + else + err = __br_vlan_set_default_pvid(br, pvid); + +unlock: + rtnl_unlock(); + return err; +} + +int br_vlan_init(struct net_bridge *br) { br->vlan_proto = htons(ETH_P_8021Q); + br->default_pvid = 1; + return br_vlan_add(br, 1, + BRIDGE_VLAN_INFO_PVID 
| BRIDGE_VLAN_INFO_UNTAGGED); } /* Must be protected by RTNL. @@ -585,3 +725,12 @@ out: rcu_read_unlock(); return found; } + +int nbp_vlan_init(struct net_bridge_port *p) +{ + return p->br->default_pvid ? + nbp_vlan_add(p, p->br->default_pvid, + BRIDGE_VLAN_INFO_PVID | + BRIDGE_VLAN_INFO_UNTAGGED) : + 0; +} diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 629dc77874a9..9cebf47ac840 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -14,6 +14,15 @@ config NFT_BRIDGE_META help Add support for bridge dedicated meta key. +config NFT_BRIDGE_REJECT + tristate "Netfilter nf_tables bridge reject support" + depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 + help + Add support to reject packets. + +config NF_LOG_BRIDGE + tristate "Bridge packet logging" + endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES @@ -202,22 +211,6 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. -config BRIDGE_EBT_ULOG - tristate "ebt: ulog support (OBSOLETE)" - help - This option enables the old bridge-specific "ebt_ulog" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds the ulog watcher, that you can use in any rule - in any ebtables table. The packet is passed to a userspace - logging daemon using netlink multicast sockets. This differs - from the log watcher in the sense that the complete packet is - sent to userspace instead of a descriptive text and that - netlink multicast sockets are used instead of the syslog. - - To compile it as a module, choose M here. If unsure, say N. 
- config BRIDGE_EBT_NFLOG tristate "ebt: nflog support" help diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 6f2f3943d66f..be4d0cea78ce 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -4,6 +4,10 @@ obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o +obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o + +# packet logging +obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o @@ -33,5 +37,4 @@ obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o # watchers obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o -obj-$(CONFIG_BRIDGE_EBT_ULOG) += ebt_ulog.o obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 5322a36867a3..17f2e4bc2a29 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -186,6 +186,10 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.log.level = info->loglevel; li.u.log.logflags = info->bitmask; + /* Remember that we have to use ebt_log_packet() not to break backward + * compatibility. We cannot use the default bridge packet logger via + * nf_log_packet() with NFT_LOG_TYPE_LOG here. 
--Pablo + */ if (info->bitmask & EBT_LOG_NFLOG) nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, "%s", info->prefix); @@ -205,54 +209,13 @@ static struct xt_target ebt_log_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_logger ebt_log_logger __read_mostly = { - .name = "ebt_log", - .logfn = &ebt_log_packet, - .me = THIS_MODULE, -}; - -static int __net_init ebt_log_net_init(struct net *net) -{ - nf_log_set(net, NFPROTO_BRIDGE, &ebt_log_logger); - return 0; -} - -static void __net_exit ebt_log_net_fini(struct net *net) -{ - nf_log_unset(net, &ebt_log_logger); -} - -static struct pernet_operations ebt_log_net_ops = { - .init = ebt_log_net_init, - .exit = ebt_log_net_fini, -}; - static int __init ebt_log_init(void) { - int ret; - - ret = register_pernet_subsys(&ebt_log_net_ops); - if (ret < 0) - goto err_pernet; - - ret = xt_register_target(&ebt_log_tg_reg); - if (ret < 0) - goto err_target; - - nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger); - - return ret; - -err_target: - unregister_pernet_subsys(&ebt_log_net_ops); -err_pernet: - return ret; + return xt_register_target(&ebt_log_tg_reg); } static void __exit ebt_log_fini(void) { - unregister_pernet_subsys(&ebt_log_net_ops); - nf_log_unregister(&ebt_log_logger); xt_unregister_target(&ebt_log_tg_reg); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c deleted file mode 100644 index 7c470c371e14..000000000000 --- a/net/bridge/netfilter/ebt_ulog.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * netfilter module for userspace bridged Ethernet frames logging daemons - * - * Authors: - * Bart De Schuymer <bdschuym@pandora.be> - * Harald Welte <laforge@netfilter.org> - * - * November, 2004 - * - * Based on ipt_ULOG.c, which is - * (C) 2000-2002 by Harald Welte <laforge@netfilter.org> - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. 
If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. - * - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/socket.h> -#include <linux/skbuff.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <net/netlink.h> -#include <linux/netdevice.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_bridge/ebtables.h> -#include <linux/netfilter_bridge/ebt_ulog.h> -#include <net/netfilter/nf_log.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include "../br_private.h" - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0600); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) " - "(defaults to 4096)"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) " - "(defaults to 10)"); - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ - spinlock_t lock; /* the per-queue lock */ -} ebt_ulog_buff_t; - -static int ebt_ulog_net_id __read_mostly; -struct ebt_ulog_net { - unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS]; - ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; - struct sock 
*ebtulognl; -}; - -static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net) -{ - return net_generic(net, ebt_ulog_net_id); -} - -/* send one ulog_buff_t to userspace */ -static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup) -{ - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup]; - - del_timer(&ub->timer); - - if (!ub->skb) - return; - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroup + 1; - netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; -} - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - struct ebt_ulog_net *ebt = container_of((void *)data, - struct ebt_ulog_net, - nlgroup[*(unsigned int *)data]); - - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data]; - spin_lock_bh(&ub->lock); - if (ub->skb) - ulog_send(ebt, *(unsigned int *)data); - spin_unlock_bh(&ub->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate buffer of size %ub\n", - size); - } - } - - return skb; -} - -static void ebt_ulog_packet(struct net *net, unsigned int hooknr, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ebt_ulog_info *uloginfo, - const char *prefix) -{ - ebt_ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - unsigned int group = uloginfo->nlgroup; - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; - spinlock_t *lock = &ub->lock; - ktime_t kt; - - if ((uloginfo->cprange == 0) || - (uloginfo->cprange > 
skb->len + ETH_HLEN)) - copy_len = skb->len + ETH_HLEN; - else - copy_len = uloginfo->cprange; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - if (size > nlbufsiz) { - pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz); - return; - } - - spin_lock_bh(lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } else if (size > skb_tailroom(ub->skb)) { - ulog_send(ebt, group); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto unlock; - } - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, 0, - size - NLMSG_ALIGN(sizeof(*nlh)), 0); - if (!nlh) { - kfree_skb(ub->skb); - ub->skb = NULL; - goto unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* Fill in the ulog data */ - pm->version = EBT_ULOG_VERSION; - kt = ktime_get_real(); - pm->stamp = ktime_to_timeval(kt); - if (ub->qlen == 1) - ub->skb->tstamp = kt; - pm->data_len = copy_len; - pm->mark = skb->mark; - pm->hook = hooknr; - if (uloginfo->prefix != NULL) - strcpy(pm->prefix, uloginfo->prefix); - - if (in) { - strcpy(pm->physindev, in->name); - /* If in isn't a bridge, then physindev==indev */ - if (br_port_exists(in)) - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name); - else - strcpy(pm->indev, in->name); - } - - if (out) { - /* If out exists, then out is a bridge port */ - strcpy(pm->physoutdev, out->name); - /* rcu_read_lock()ed by nf_hook_slow */ - strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name); - } - - if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) - BUG(); - - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - if (ub->qlen >= uloginfo->qthreshold) - ulog_send(ebt, group); - else if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - -unlock: - spin_unlock_bh(lock); -} - -/* this function is registered with the netfilter core */ -static void ebt_log_packet(struct net 
*net, u_int8_t pf, unsigned int hooknum, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct nf_loginfo *li, - const char *prefix) -{ - struct ebt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; - loginfo.cprange = 0; - loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nlgroup = li->u.ulog.group; - loginfo.cprange = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static unsigned int -ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return EBT_CONTINUE; -} - -static int ebt_ulog_tg_check(const struct xt_tgchk_param *par) -{ - struct ebt_ulog_info *uloginfo = par->targinfo; - - if (!par->net->xt.ebt_ulog_warn_deprecated) { - pr_info("ebt_ulog is deprecated and it will be removed soon, " - "use ebt_nflog instead\n"); - par->net->xt.ebt_ulog_warn_deprecated = true; - } - - if (uloginfo->nlgroup > 31) - return -EINVAL; - - uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0'; - - if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN) - uloginfo->qthreshold = EBT_ULOG_MAX_QLEN; - - return 0; -} - -static struct xt_target ebt_ulog_tg_reg __read_mostly = { - .name = "ulog", - .revision = 0, - .family = NFPROTO_BRIDGE, - .target = ebt_ulog_tg, - .checkentry = ebt_ulog_tg_check, - .targetsize = sizeof(struct ebt_ulog_info), - .me = THIS_MODULE, -}; - -static struct nf_logger ebt_ulog_logger __read_mostly = { - .name = "ebt_ulog", - .logfn = &ebt_log_packet, - .me = THIS_MODULE, -}; - -static int __net_init ebt_ulog_net_init(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - 
struct netlink_kernel_cfg cfg = { - .groups = EBT_ULOG_MAXNLGROUPS, - }; - - /* initialize ulog_buffers */ - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt->nlgroup[i] = i; - setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ebt->nlgroup[i]); - spin_lock_init(&ebt->ulog_buffers[i].lock); - } - - ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ebt->ebtulognl) - return -ENOMEM; - - nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger); - return 0; -} - -static void __net_exit ebt_ulog_net_fini(struct net *net) -{ - int i; - struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); - - nf_log_unset(net, &ebt_ulog_logger); - for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { - ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i]; - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } - netlink_kernel_release(ebt->ebtulognl); -} - -static struct pernet_operations ebt_ulog_net_ops = { - .init = ebt_ulog_net_init, - .exit = ebt_ulog_net_fini, - .id = &ebt_ulog_net_id, - .size = sizeof(struct ebt_ulog_net), -}; - -static int __init ebt_ulog_init(void) -{ - int ret; - - if (nlbufsiz >= 128*1024) { - pr_warn("Netlink buffer has to be <= 128kB," - "please try a smaller nlbufsiz parameter.\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ebt_ulog_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ebt_ulog_tg_reg); - if (ret) - goto out_target; - - nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ebt_ulog_net_ops); -out_pernet: - return ret; -} - -static void __exit ebt_ulog_fini(void) -{ - nf_log_unregister(&ebt_ulog_logger); - xt_unregister_target(&ebt_ulog_tg_reg); - unregister_pernet_subsys(&ebt_ulog_net_ops); -} - -module_init(ebt_ulog_init); -module_exit(ebt_ulog_fini); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); -MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG"); diff 
--git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 1059ed3bc255..d9a8c05d995d 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -26,6 +26,7 @@ #include <asm/uaccess.h> #include <linux/smp.h> #include <linux/cpumask.h> +#include <linux/audit.h> #include <net/sock.h> /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" @@ -327,10 +328,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error, char name[EBT_FUNCTION_MAXNAMELEN]; } *e; - *error = mutex_lock_interruptible(mutex); - if (*error != 0) - return NULL; - + mutex_lock(mutex); list_for_each_entry(e, head, list) { if (strcmp(e->name, name) == 0) return e; @@ -1061,6 +1059,20 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, vfree(table); vfree(counterstmp); + +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + repl->name, AF_BRIDGE, repl->nentries); + audit_log_end(ab); + } + } +#endif return ret; free_unlock: @@ -1203,10 +1215,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) table->private = newinfo; rwlock_init(&table->lock); - ret = mutex_lock_interruptible(&ebt_mutex); - if (ret != 0) - goto free_chainstack; - + mutex_lock(&ebt_mutex); list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c new file mode 100644 index 000000000000..5d9953a90929 --- /dev/null +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -0,0 +1,96 @@ +/* + * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * 
published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_bridge.h> +#include <linux/ip.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_log.h> + +static void nf_log_bridge_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + case htons(ETH_P_IPV6): + nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out, + loginfo, "%s", prefix); + break; + } +} + +static struct nf_logger nf_bridge_logger __read_mostly = { + .name = "nf_log_bridge", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_bridge_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_bridge_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); + return 0; +} + +static void __net_exit nf_log_bridge_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_bridge_logger); +} + +static struct pernet_operations nf_log_bridge_net_ops = { + .init = nf_log_bridge_net_init, + .exit = nf_log_bridge_net_exit, +}; + +static int __init nf_log_bridge_init(void) +{ + int ret; + + /* Request to load the real packet loggers. 
*/ + nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); + nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); + nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); + + ret = register_pernet_subsys(&nf_log_bridge_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); + return 0; +} + +static void __exit nf_log_bridge_exit(void) +{ + unregister_pernet_subsys(&nf_log_bridge_net_ops); + nf_log_unregister(&nf_bridge_logger); +} + +module_init(nf_log_bridge_init); +module_exit(nf_log_bridge_exit); + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter bridge packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 5bcc0d8b31f2..da17a5eab8b4 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -34,9 +34,11 @@ static struct nft_af_info nft_af_bridge __read_mostly = { .owner = THIS_MODULE, .nops = 1, .hooks = { + [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, [NF_BR_LOCAL_IN] = nft_do_chain_bridge, [NF_BR_FORWARD] = nft_do_chain_bridge, [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + [NF_BR_POST_ROUTING] = nft_do_chain_bridge, }, }; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c new file mode 100644 index 000000000000..a76479535df2 --- /dev/null +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2014 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +static void nft_reject_bridge_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (eth_hdr(pkt->skb)->h_proto) { + case htons(ETH_P_IP): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; + case htons(ETH_P_IPV6): + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; + default: + /* No explicit way to reject this protocol, drop it. 
*/ + break; + } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_bridge_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + return 0; +} + +static int nft_reject_bridge_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_reject_bridge_type; +static const struct nft_expr_ops nft_reject_bridge_ops = { + .type = &nft_reject_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_bridge_eval, + .init = nft_reject_bridge_init, + .dump = nft_reject_bridge_dump, +}; + +static struct nft_expr_type nft_reject_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "reject", + .ops = &nft_reject_bridge_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_bridge_module_init(void) +{ + return nft_register_expr(&nft_reject_bridge_type); +} + +static void __exit 
nft_reject_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_reject_bridge_type); +} + +module_init(nft_reject_bridge_module_init); +module_exit(nft_reject_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "reject"); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index e8437094d15f..43f750e88e19 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -908,8 +908,7 @@ static int caif_release(struct socket *sock) sock->sk = NULL; WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir)); - if (cf_sk->debugfs_socket_dir != NULL) - debugfs_remove_recursive(cf_sk->debugfs_socket_dir); + debugfs_remove_recursive(cf_sk->debugfs_socket_dir); lock_sock(&(cf_sk->sk)); sk->sk_state = CAIF_DISCONNECTED; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 0f455227da83..f5afda1abc76 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -547,7 +547,6 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) default: pr_err("Unrecognized Control Frame\n"); goto error; - break; } ret = 0; error: diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 96238ba95f2b..de6662b14e1f 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -13,8 +13,6 @@ #include "auth_x.h" #include "auth_x_protocol.h" -#define TEMP_TICKET_BUF_LEN 256 - static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) @@ -64,7 +62,7 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, } static int ceph_x_decrypt(struct ceph_crypto_key *secret, - void **p, void *end, void *obuf, size_t olen) + void **p, void *end, void **obuf, size_t olen) { struct ceph_x_encrypt_header head; size_t head_len = sizeof(head); @@ -75,8 +73,14 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, return -EINVAL; dout("ceph_x_decrypt len %d\n", len); - ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, 
- *p, len); + if (*obuf == NULL) { + *obuf = kmalloc(len, GFP_NOFS); + if (!*obuf) + return -ENOMEM; + olen = len; + } + + ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len); if (ret) return ret; if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) @@ -129,139 +133,120 @@ static void remove_ticket_handler(struct ceph_auth_client *ac, kfree(th); } -static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, - struct ceph_crypto_key *secret, - void *buf, void *end) +static int process_one_ticket(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void **p, void *end) { struct ceph_x_info *xi = ac->private; - int num; - void *p = buf; + int type; + u8 tkt_struct_v, blob_struct_v; + struct ceph_x_ticket_handler *th; + void *dbuf = NULL; + void *dp, *dend; + int dlen; + char is_enc; + struct timespec validity; + struct ceph_crypto_key old_key; + void *ticket_buf = NULL; + void *tp, *tpend; + struct ceph_timespec new_validity; + struct ceph_crypto_key new_session_key; + struct ceph_buffer *new_ticket_blob; + unsigned long new_expires, new_renew_after; + u64 new_secret_id; int ret; - char *dbuf; - char *ticket_buf; - u8 reply_struct_v; - dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!dbuf) - return -ENOMEM; + ceph_decode_need(p, end, sizeof(u32) + 1, bad); - ret = -ENOMEM; - ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!ticket_buf) - goto out_dbuf; + type = ceph_decode_32(p); + dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - reply_struct_v = ceph_decode_8(&p); - if (reply_struct_v != 1) + tkt_struct_v = ceph_decode_8(p); + if (tkt_struct_v != 1) goto bad; - num = ceph_decode_32(&p); - dout("%d tickets\n", num); - while (num--) { - int type; - u8 tkt_struct_v, blob_struct_v; - struct ceph_x_ticket_handler *th; - void *dp, *dend; - int dlen; - char is_enc; - struct timespec validity; - struct ceph_crypto_key old_key; - void *tp, 
*tpend; - struct ceph_timespec new_validity; - struct ceph_crypto_key new_session_key; - struct ceph_buffer *new_ticket_blob; - unsigned long new_expires, new_renew_after; - u64 new_secret_id; - - ceph_decode_need(&p, end, sizeof(u32) + 1, bad); - - type = ceph_decode_32(&p); - dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - - tkt_struct_v = ceph_decode_8(&p); - if (tkt_struct_v != 1) - goto bad; - - th = get_ticket_handler(ac, type); - if (IS_ERR(th)) { - ret = PTR_ERR(th); - goto out; - } - /* blob for me */ - dlen = ceph_x_decrypt(secret, &p, end, dbuf, - TEMP_TICKET_BUF_LEN); - if (dlen <= 0) { - ret = dlen; - goto out; - } - dout(" decrypted %d bytes\n", dlen); - dend = dbuf + dlen; - dp = dbuf; + th = get_ticket_handler(ac, type); + if (IS_ERR(th)) { + ret = PTR_ERR(th); + goto out; + } - tkt_struct_v = ceph_decode_8(&dp); - if (tkt_struct_v != 1) - goto bad; + /* blob for me */ + dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0); + if (dlen <= 0) { + ret = dlen; + goto out; + } + dout(" decrypted %d bytes\n", dlen); + dp = dbuf; + dend = dp + dlen; - memcpy(&old_key, &th->session_key, sizeof(old_key)); - ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); - if (ret) - goto out; + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) + goto bad; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); - new_expires = get_seconds() + validity.tv_sec; - new_renew_after = new_expires - (validity.tv_sec / 4); - dout(" expires=%lu renew_after=%lu\n", new_expires, - new_renew_after); + memcpy(&old_key, &th->session_key, sizeof(old_key)); + ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); + if (ret) + goto out; - /* ticket blob for service */ - ceph_decode_8_safe(&p, end, is_enc, bad); - tp = ticket_buf; - if (is_enc) { - /* encrypted */ - dout(" encrypted ticket\n"); - dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, - TEMP_TICKET_BUF_LEN); - if (dlen < 0) { - ret = 
dlen; - goto out; - } - dlen = ceph_decode_32(&tp); - } else { - /* unencrypted */ - ceph_decode_32_safe(&p, end, dlen, bad); - ceph_decode_need(&p, end, dlen, bad); - ceph_decode_copy(&p, ticket_buf, dlen); + ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); + ceph_decode_timespec(&validity, &new_validity); + new_expires = get_seconds() + validity.tv_sec; + new_renew_after = new_expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", new_expires, + new_renew_after); + + /* ticket blob for service */ + ceph_decode_8_safe(p, end, is_enc, bad); + if (is_enc) { + /* encrypted */ + dout(" encrypted ticket\n"); + dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0); + if (dlen < 0) { + ret = dlen; + goto out; } - tpend = tp + dlen; - dout(" ticket blob is %d bytes\n", dlen); - ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - blob_struct_v = ceph_decode_8(&tp); - new_secret_id = ceph_decode_64(&tp); - ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); - if (ret) + tp = ticket_buf; + dlen = ceph_decode_32(&tp); + } else { + /* unencrypted */ + ceph_decode_32_safe(p, end, dlen, bad); + ticket_buf = kmalloc(dlen, GFP_NOFS); + if (!ticket_buf) { + ret = -ENOMEM; goto out; - - /* all is well, update our ticket */ - ceph_crypto_key_destroy(&th->session_key); - if (th->ticket_blob) - ceph_buffer_put(th->ticket_blob); - th->session_key = new_session_key; - th->ticket_blob = new_ticket_blob; - th->validity = new_validity; - th->secret_id = new_secret_id; - th->expires = new_expires; - th->renew_after = new_renew_after; - dout(" got ticket service %d (%s) secret_id %lld len %d\n", - type, ceph_entity_type_name(type), th->secret_id, - (int)th->ticket_blob->vec.iov_len); - xi->have_keys |= th->service; + } + tp = ticket_buf; + ceph_decode_need(p, end, dlen, bad); + ceph_decode_copy(p, ticket_buf, dlen); } + tpend = tp + dlen; + dout(" ticket blob is %d bytes\n", dlen); + ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); + blob_struct_v = 
ceph_decode_8(&tp); + new_secret_id = ceph_decode_64(&tp); + ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); + if (ret) + goto out; + + /* all is well, update our ticket */ + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + th->session_key = new_session_key; + th->ticket_blob = new_ticket_blob; + th->validity = new_validity; + th->secret_id = new_secret_id; + th->expires = new_expires; + th->renew_after = new_renew_after; + dout(" got ticket service %d (%s) secret_id %lld len %d\n", + type, ceph_entity_type_name(type), th->secret_id, + (int)th->ticket_blob->vec.iov_len); + xi->have_keys |= th->service; - ret = 0; out: kfree(ticket_buf); -out_dbuf: kfree(dbuf); return ret; @@ -270,6 +255,34 @@ bad: goto out; } +static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void *buf, void *end) +{ + void *p = buf; + u8 reply_struct_v; + u32 num; + int ret; + + ceph_decode_8_safe(&p, end, reply_struct_v, bad); + if (reply_struct_v != 1) + return -EINVAL; + + ceph_decode_32_safe(&p, end, num, bad); + dout("%d tickets\n", num); + + while (num--) { + ret = process_one_ticket(ac, secret, &p, end); + if (ret) + return ret; + } + + return 0; + +bad: + return -EINVAL; +} + static int ceph_x_build_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th, struct ceph_x_authorizer *au) @@ -583,13 +596,14 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th; int ret = 0; struct ceph_x_authorize_reply reply; + void *preply = &reply; void *p = au->reply_buf; void *end = p + sizeof(au->reply_buf); th = get_ticket_handler(ac, au->service); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); + ret = ceph_x_decrypt(&th->session_key, &p, end, &preply, sizeof(reply)); if (ret < 0) return ret; if (ret != sizeof(reply)) diff --git a/net/ceph/messenger.c 
b/net/ceph/messenger.c index 1948d592aa54..b2f571dd933d 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -174,6 +174,7 @@ static struct lock_class_key socket_class; #define SKIP_BUF_SIZE 1024 static void queue_con(struct ceph_connection *con); +static void cancel_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void con_fault(struct ceph_connection *con); @@ -680,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con) reset_connection(con); con->peer_global_seq = 0; - cancel_delayed_work(&con->work); + cancel_con(con); con_close_socket(con); mutex_unlock(&con->mutex); } @@ -900,7 +901,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); - cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; + cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; } static struct page * @@ -2667,19 +2668,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); - return -ENOENT; } if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); - return -EBUSY; } dout("%s %p %lu\n", __func__, con, delay); - return 0; } @@ -2688,6 +2686,14 @@ static void queue_con(struct ceph_connection *con) (void) queue_con_delay(con, 0); } +static void cancel_con(struct ceph_connection *con) +{ + if (cancel_delayed_work(&con->work)) { + dout("%s %p\n", __func__, con); + con->ops->put(con); + } +} + static bool con_sock_closed(struct ceph_connection *con) { if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) @@ -3269,24 +3275,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) /* * Free a generically kmalloc'd message. 
*/ -void ceph_msg_kfree(struct ceph_msg *m) +static void ceph_msg_free(struct ceph_msg *m) { - dout("msg_kfree %p\n", m); + dout("%s %p\n", __func__, m); ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } -/* - * Drop a msg ref. Destroy as needed. - */ -void ceph_msg_last_put(struct kref *kref) +static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); LIST_HEAD(data); struct list_head *links; struct list_head *next; - dout("ceph_msg_put last one on %p\n", m); + dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); /* drop middle, data, if any */ @@ -3308,9 +3311,25 @@ void ceph_msg_last_put(struct kref *kref) if (m->pool) ceph_msgpool_put(m->pool, m); else - ceph_msg_kfree(m); + ceph_msg_free(m); +} + +struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) +{ + dout("%s %p (was %d)\n", __func__, msg, + atomic_read(&msg->kref.refcount)); + kref_get(&msg->kref); + return msg; +} +EXPORT_SYMBOL(ceph_msg_get); + +void ceph_msg_put(struct ceph_msg *msg) +{ + dout("%s %p (was %d)\n", __func__, msg, + atomic_read(&msg->kref.refcount)); + kref_put(&msg->kref, ceph_msg_release); } -EXPORT_SYMBOL(ceph_msg_last_put); +EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 067d3af2eaf6..61fcfc304f68 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1181,7 +1181,15 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, if (!m) { pr_info("alloc_msg unknown type %d\n", type); *skip = 1; + } else if (front_len > m->front_alloc_len) { + pr_warning("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n", + front_len, m->front_alloc_len, + (unsigned int)con->peer_name.type, + le64_to_cpu(con->peer_name.num)); + ceph_msg_put(m); + m = ceph_msg_new(type, front_len, GFP_NOFS, false); } + return m; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 05be0c181695..30f6faf3584f 
100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -297,12 +297,21 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, /* * requests */ -void ceph_osdc_release_request(struct kref *kref) +static void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req; + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, r_kref); unsigned int which; - req = container_of(kref, struct ceph_osd_request, r_kref); + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, + req->r_request, req->r_reply); + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!list_empty(&req->r_req_lru_item)); + WARN_ON(!list_empty(&req->r_osd_item)); + WARN_ON(!list_empty(&req->r_linger_item)); + WARN_ON(!list_empty(&req->r_linger_osd_item)); + WARN_ON(req->r_osd); + if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) { @@ -320,7 +329,22 @@ void ceph_osdc_release_request(struct kref *kref) kmem_cache_free(ceph_osd_request_cache, req); } -EXPORT_SYMBOL(ceph_osdc_release_request); + +void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_get(&req->r_kref); +} +EXPORT_SYMBOL(ceph_osdc_get_request); + +void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); +} +EXPORT_SYMBOL(ceph_osdc_put_request); struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -364,7 +388,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd); + INIT_LIST_HEAD(&req->r_linger_osd_item); INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); @@ -916,7 +940,7 @@ 
static void __kick_osd_requests(struct ceph_osd_client *osdc, * list at the end to keep things in tid order. */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd) { + r_linger_osd_item) { /* * reregister request prior to unregistering linger so * that r_osd is preserved. @@ -1008,6 +1032,8 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { dout("__remove_osd %p\n", osd); BUG_ON(!list_empty(&osd->o_requests)); + BUG_ON(!list_empty(&osd->o_linger_requests)); + rb_erase(&osd->o_node, &osdc->osds); list_del_init(&osd->o_osd_lru); ceph_con_close(&osd->o_con); @@ -1029,12 +1055,23 @@ static void remove_all_osds(struct ceph_osd_client *osdc) static void __move_osd_to_lru(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - dout("__move_osd_to_lru %p\n", osd); + dout("%s %p\n", __func__, osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; } +static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("%s %p\n", __func__, osd); + + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) + __move_osd_to_lru(osdc, osd); +} + static void __remove_osd_from_lru(struct ceph_osd *osd) { dout("__remove_osd_from_lru %p\n", osd); @@ -1175,6 +1212,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &osdc->requests); + RB_CLEAR_NODE(&req->r_node); osdc->num_requests--; if (req->r_osd) { @@ -1182,12 +1220,8 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_msg_revoke(req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } - if (list_empty(&req->r_linger_item)) + 
maybe_move_osd_to_lru(osdc, req->r_osd); + if (list_empty(&req->r_linger_osd_item)) req->r_osd = NULL; } @@ -1214,45 +1248,39 @@ static void __cancel_request(struct ceph_osd_request *req) static void __register_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__register_linger_request %p\n", req); + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(!req->r_linger); + ceph_osdc_get_request(req); list_add_tail(&req->r_linger_item, &osdc->req_linger); if (req->r_osd) - list_add_tail(&req->r_linger_osd, + list_add_tail(&req->r_linger_osd_item, &req->r_osd->o_linger_requests); } static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - dout("__unregister_linger_request %p\n", req); + WARN_ON(!req->r_linger); + + if (list_empty(&req->r_linger_item)) { + dout("%s %p tid %llu not registered\n", __func__, req, + req->r_tid); + return; + } + + dout("%s %p tid %llu\n", __func__, req, req->r_tid); list_del_init(&req->r_linger_item); - if (req->r_osd) { - list_del_init(&req->r_linger_osd); - if (list_empty(&req->r_osd->o_requests) && - list_empty(&req->r_osd->o_linger_requests)) { - dout("moving osd to %p lru\n", req->r_osd); - __move_osd_to_lru(osdc, req->r_osd); - } + if (req->r_osd) { + list_del_init(&req->r_linger_osd_item); + maybe_move_osd_to_lru(osdc, req->r_osd); if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } ceph_osdc_put_request(req); } -void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - if (req->r_linger) { - req->r_linger = 0; - __unregister_linger_request(osdc, req); - } - mutex_unlock(&osdc->request_mutex); -} -EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); - void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { @@ -2430,6 +2458,25 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, 
EXPORT_SYMBOL(ceph_osdc_start_request); /* + * Unregister a registered request. The request is not completed (i.e. + * no callbacks or wakeups) - higher layers are supposed to know what + * they are canceling. + */ +void ceph_osdc_cancel_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + mutex_lock(&osdc->request_mutex); + if (req->r_linger) + __unregister_linger_request(osdc, req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); +} +EXPORT_SYMBOL(ceph_osdc_cancel_request); + +/* * wait for a request to complete */ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, @@ -2437,18 +2484,18 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, { int rc; + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + rc = wait_for_completion_interruptible(&req->r_completion); if (rc < 0) { - mutex_lock(&osdc->request_mutex); - __cancel_request(req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); + ceph_osdc_cancel_request(req); complete_request(req); - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); return rc; } - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, + req->r_result); return req->r_result; } EXPORT_SYMBOL(ceph_osdc_wait_request); diff --git a/net/core/Makefile b/net/core/Makefile index 71093d94ad2b..235e6c50708d 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 
488dd1a825c0..fdbc9a81d4c2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -775,7 +775,7 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @iov: io vector diff --git a/net/core/dev.c b/net/core/dev.c index 367a586d0c8a..4699dcfdc4ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> +#include <linux/errqueue.h> #include "net-sysfs.h" @@ -896,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * dev_get_by_flags_rcu - find any device with given flags + * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside - * rcu_read_lock(), and result refcount is unchanged. + * rtnl_lock(), and result refcount is unchanged. 
*/ -struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) { struct net_device *dev, *ret; + ASSERT_RTNL(); + ret = NULL; - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; @@ -920,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags } return ret; } -EXPORT_SYMBOL(dev_get_by_flags_rcu); +EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device @@ -1085,6 +1088,7 @@ static int dev_get_valid_name(struct net *net, */ int dev_change_name(struct net_device *dev, const char *newname) { + unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; @@ -1112,10 +1116,17 @@ int dev_change_name(struct net_device *dev, const char *newname) return err; } + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; write_seqcount_end(&devnet_rename_seq); return ret; } @@ -1144,6 +1155,8 @@ rollback: write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -1273,7 +1286,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1352,7 +1364,6 @@ static int __dev_close_many(struct list_head *head) 
ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); netpoll_poll_enable(dev); } @@ -2166,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) return (struct dev_kfree_skb_cb *)skb->cb; } +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +/** + * netif_wake_subqueue - allow sending packets on subqueue + * @dev: network device + * @queue_index: sub queue index + * + * Resume individual transmit queue of a device with multiple transmit queues. + */ +void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); + + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(txq->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_wake_subqueue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; @@ -2316,7 +2374,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) */ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { if (vlan_depth) { - if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN))) + if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { @@ -2362,16 +2420,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { - 
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - int err; - - err = ptype->callbacks.gso_send_check(skb); - segs = ERR_PTR(err); - if (err || skb_gso_ok(skb, features)) - break; - __skb_push(skb, (skb->data - - skb_network_header(skb))); - } segs = ptype->callbacks.gso_segment(skb, features); break; } @@ -2414,8 +2462,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_warn_bad_offload(skb); - if (skb_header_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_cow_head(skb, 0); + if (err < 0) return ERR_PTR(err); } @@ -2474,52 +2522,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) return 0; } -struct dev_gso_cb { - void (*destructor)(struct sk_buff *skb); -}; - -#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) - -static void dev_gso_skb_destructor(struct sk_buff *skb) -{ - struct dev_gso_cb *cb; - - kfree_skb_list(skb->next); - skb->next = NULL; - - cb = DEV_GSO_CB(skb); - if (cb->destructor) - cb->destructor(skb); -} - -/** - * dev_gso_segment - Perform emulated hardware segmentation on skb. - * @skb: buffer to segment - * @features: device features as applicable to this skb - * - * This function segments the given skb and stores the list of segments - * in skb->next. - */ -static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - struct sk_buff *segs; - - segs = skb_gso_segment(skb, features); - - /* Verifying header integrity only. */ - if (!segs) - return 0; - - if (IS_ERR(segs)) - return PTR_ERR(segs); - - skb->next = segs; - DEV_GSO_CB(skb)->destructor = skb->destructor; - skb->destructor = dev_gso_skb_destructor; - - return 0; -} - /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. 
*/ @@ -2563,10 +2565,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t netif_skb_features(struct sk_buff *skb) { + const struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + u16 gso_segs = skb_shinfo(skb)->gso_segs; __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { @@ -2576,131 +2580,167 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) return harmonize_features(skb, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + features = netdev_intersect_features(features, + dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) - features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; + features = netdev_intersect_features(features, + NETIF_F_SG | + NETIF_F_HIGHDMA | + NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); -int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) { - const struct net_device_ops *ops = dev->netdev_ops; - int rc = NETDEV_TX_OK; - unsigned int skb_len; - - if (likely(!skb->next)) { - netdev_features_t features; + unsigned int len; + int rc; - /* - * If device doesn't need skb->dst, release it right now while - * its hot in this cpu cache - */ - if (dev->priv_flags & IFF_XMIT_DST_RELEASE) - skb_dst_drop(skb); + if 
(!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); - features = netif_skb_features(skb); + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); - if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (unlikely(!skb)) - goto out; + return rc; +} - skb->vlan_tci = 0; - } +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; + int rc = NETDEV_TX_OK; - /* If encapsulation offload request, verify we are testing - * hardware encapsulation features instead of standard - * features for the netdev - */ - if (skb->encapsulation) - features &= dev->hw_enc_features; + while (skb) { + struct sk_buff *next = skb->next; - if (netif_needs_gso(skb, features)) { - if (unlikely(dev_gso_segment(skb, features))) - goto out_kfree_skb; - if (skb->next) - goto gso; - } else { - if (skb_needs_linearize(skb, features) && - __skb_linearize(skb)) - goto out_kfree_skb; + skb->next = NULL; + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } - /* If packet is not checksummed and device does not - * support checksumming for this protocol, complete - * checksumming here. 
- */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->encapsulation) - skb_set_inner_transport_header(skb, - skb_checksum_start_offset(skb)); - else - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && - skb_checksum_help(skb)) - goto out_kfree_skb; - } + skb = next; + if (netif_xmit_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; } + } - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); +out: + *ret = rc; + return skb; +} - skb_len = skb->len; - trace_net_dev_start_xmit(skb, dev); - rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc, dev, skb_len); - if (rc == NETDEV_TX_OK) - txq_trans_update(txq); - return rc; +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (vlan_tx_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_put_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (skb) + skb->vlan_tci = 0; } + return skb; +} + +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +{ + netdev_features_t features; -gso: - do { - struct sk_buff *nskb = skb->next; + if (skb->next) + return skb; - skb->next = nskb->next; - nskb->next = NULL; + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(nskb, dev); - - skb_len = nskb->len; - trace_net_dev_start_xmit(nskb, dev); - rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc, dev, skb_len); - if (unlikely(rc != NETDEV_TX_OK)) { - if (rc & ~NETDEV_TX_MASK) - goto out_kfree_gso_skb; - nskb->next = skb->next; - skb->next = nskb; - return rc; + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (netif_needs_gso(skb, 
features)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + segs = NULL; + } else if (segs) { + consume_skb(skb); + skb = segs; } - txq_trans_update(txq); - if (unlikely(netif_xmit_stopped(txq) && skb->next)) - return NETDEV_TX_BUSY; - } while (skb->next); + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; -out_kfree_gso_skb: - if (likely(skb->next == NULL)) { - skb->destructor = DEV_GSO_CB(skb)->destructor; - consume_skb(skb); - return rc; + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; + } } + + return skb; + out_kfree_skb: kfree_skb(skb); -out: - return rc; +out_null: + return NULL; +} + +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *next, *head = NULL, *tail; + + for (; skb != NULL; skb = next) { + next = skb->next; + skb->next = NULL; + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; + } + return head; } -EXPORT_SYMBOL_GPL(dev_hard_start_xmit); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -2745,8 +2785,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. 
- * This permits __QDISC_STATE_RUNNING owner to get the lock more often - * and dequeue packets faster. + * This permits __QDISC___STATE_RUNNING owner to get the lock more + * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) @@ -2763,12 +2803,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) - skb_dst_force(skb); qdisc_bstats_update(q, skb); - if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; @@ -2779,7 +2817,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { - skb_dst_force(skb); rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { @@ -2866,6 +2903,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ @@ -2873,6 +2913,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. 
+ */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -2905,11 +2953,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) goto recursion_alert; + skb = validate_xmit_skb(skb, dev); + if (!skb) + goto drop; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); - rc = dev_hard_start_xmit(skb, dev, txq); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); @@ -2930,10 +2982,11 @@ recursion_alert: } rc = -ENETDOWN; +drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); - kfree_skb(skb); + kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); @@ -3110,8 +3163,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } if (map) { - tcpu = map->cpus[((u64) hash * map->len) >> 32]; - + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; @@ -3447,7 +3499,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - q = rxq->qdisc; + q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) @@ -3464,7 +3516,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - if (!rxq || rxq->qdisc == &noop_qdisc) + if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) goto out; if (*pt_prev) { @@ -3588,7 +3640,7 @@ another_round: if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { - skb = vlan_untag(skb); + skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto 
unlock; } @@ -3945,11 +3997,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) goto normal; gro_list_prepare(napi, skb); - NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3963,6 +4014,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->udp_mark = 0; + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; } @@ -4192,6 +4259,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_frags); +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. 
* Note: called with local irq disabled, but exits with local irq enabled. @@ -4485,14 +4577,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: @@ -4789,9 +4873,14 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev, sysfs_remove_link(&(dev->dev.kobj), linkname); } -#define netdev_adjacent_is_neigh_list(dev, dev_list) \ - (dev_list == &dev->adj_list.upper || \ - dev_list == &dev->adj_list.lower) +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, @@ -4821,7 +4910,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, pr_debug("dev_hold for %s, because of link added from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) { + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; @@ -4842,7 +4931,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, return 0; remove_symlinks: - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: kfree(adj); @@ -4875,7 +4964,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); - if (netdev_adjacent_is_neigh_list(dev, dev_list)) + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); @@ -5145,11 +5234,65 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; + struct net *net = dev_net(dev); + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -5157,6 +5300,8 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) } 
list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net,dev_net(iter->dev))) + continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, @@ -5440,13 +5585,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + if ((old_flags ^ flags) & IFF_UP) ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); - if (!ret) - dev_set_rx_mode(dev); - } - if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; @@ -6446,17 +6587,19 @@ void netdev_freemem(struct net_device *dev) /** * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. 
*/ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { @@ -6510,6 +6653,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); @@ -6519,7 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); - dev->priv_flags = IFF_XMIT_DST_RELEASE; + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); dev->num_tx_queues = txqs; @@ -6535,6 +6679,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #endif strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; @@ -6760,6 +6905,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -6774,6 +6920,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); @@ -6938,51 +7085,45 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, - struct va_format *vaf) +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) { - int r; - if (dev && 
dev->dev.parent) { - r = dev_printk_emit(level[1] - '0', - dev->dev.parent, - "%s %s %s: %pV", - dev_driver_string(dev->dev.parent), - dev_name(dev->dev.parent), - netdev_name(dev), vaf); + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); } else if (dev) { - r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { - r = printk("%s(NULL net_device): %pV", level, vaf); + printk("%s(NULL net_device): %pV", level, vaf); } - - return r; } -int netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...) +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) { struct va_format vaf; va_list args; - int r; va_start(args, format); vaf.fmt = format; vaf.va = &args; - r = __netdev_printk(level, dev, &vaf); + __netdev_printk(level, dev, &vaf); va_end(args); - - return r; } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ -int func(const struct net_device *dev, const char *fmt, ...) \ +void func(const struct net_device *dev, const char *fmt, ...) \ { \ - int r; \ struct va_format vaf; \ va_list args; \ \ @@ -6991,11 +7132,9 @@ int func(const struct net_device *dev, const char *fmt, ...) 
\ vaf.fmt = fmt; \ vaf.va = &args; \ \ - r = __netdev_printk(level, dev, &vaf); \ + __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ - \ - return r; \ } \ EXPORT_SYMBOL(func); @@ -7103,7 +7242,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cf999e09bcd2..72e899a3efda 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -365,11 +365,8 @@ void dev_load(struct net *net, const char *name) no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); - if (no_module && capable(CAP_SYS_MODULE)) { - if (!request_module("%s", name)) - pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). 
Use CAP_NET_ADMIN and alias netdev-%s instead.\n", - name); - } + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); } EXPORT_SYMBOL(dev_load); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index e70301eb7a4a..50f9a9db5792 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb, switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return set_all_monitor_traces(TRACE_ON); - break; case NET_DM_CMD_STOP: return set_all_monitor_traces(TRACE_OFF); - break; } return -ENOTSUPP; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..1600aa24d36b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,81 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void 
*data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1745,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1933,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 1dbf6462f766..647b12265e18 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,7 +18,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. 
- * Kris Katterjohn - Added many additional checks in sk_chk_filter() + * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <linux/module.h> @@ -45,63 +45,15 @@ #include <linux/seccomp.h> #include <linux/if_vlan.h> -/* Registers */ -#define BPF_R0 regs[BPF_REG_0] -#define BPF_R1 regs[BPF_REG_1] -#define BPF_R2 regs[BPF_REG_2] -#define BPF_R3 regs[BPF_REG_3] -#define BPF_R4 regs[BPF_REG_4] -#define BPF_R5 regs[BPF_REG_5] -#define BPF_R6 regs[BPF_REG_6] -#define BPF_R7 regs[BPF_REG_7] -#define BPF_R8 regs[BPF_REG_8] -#define BPF_R9 regs[BPF_REG_9] -#define BPF_R10 regs[BPF_REG_10] - -/* Named registers */ -#define DST regs[insn->dst_reg] -#define SRC regs[insn->src_reg] -#define FP regs[BPF_REG_FP] -#define ARG1 regs[BPF_REG_ARG1] -#define CTX regs[BPF_REG_CTX] -#define IMM insn->imm - -/* No hurry in this branch - * - * Exported for the bpf jit load helper. - */ -void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) -{ - u8 *ptr = NULL; - - if (k >= SKF_NET_OFF) - ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) - ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) - return ptr; - - return NULL; -} - -static inline void *load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) -{ - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - - return bpf_internal_load_pointer_neg_helper(skb, k, size); -} - /** * sk_filter - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * * Run the filter code and then cut skb->data to correct size returned by - * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller + * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to sk_run_filter. 
It returns 0 if the packet should + * wrapper to SK_RUN_FILTER. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -135,478 +87,9 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); -/* Base function for offset calculation. Needs to go into .text section, - * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. - */ -noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -/** - * __sk_run_filter - run a filter on a given context - * @ctx: buffer to run the filter on - * @insn: filter to apply - * - * Decode and apply filter instructions to the skb->data. Return length to - * keep, 0 for none. @ctx is the data we are operating on, @insn is the - * array of filter instructions. - */ -static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; - static const void *jumptable[256] = { - [0 ... 255] = &&default_label, - /* Now overwrite non-defaults ... 
*/ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | 
BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - 
[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - }; - void *ptr; - int off; - -#define CONT ({ insn++; goto select_insn; }) -#define CONT_JMP ({ insn++; goto select_insn; }) - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - -select_insn: - goto *jumptable[insn->code]; - - /* ALU */ -#define ALU(OPCODE, OP) \ - ALU64_##OPCODE##_X: \ - DST = DST OP SRC; \ - CONT; \ - ALU_##OPCODE##_X: \ - DST = (u32) DST OP (u32) SRC; \ - CONT; \ - ALU64_##OPCODE##_K: \ - DST = DST OP IMM; \ - CONT; \ - ALU_##OPCODE##_K: \ - DST = (u32) DST OP (u32) IMM; \ - CONT; - - ALU(ADD, +) - ALU(SUB, -) - ALU(AND, &) - ALU(OR, |) - ALU(LSH, <<) - ALU(RSH, >>) - ALU(XOR, ^) - ALU(MUL, *) -#undef ALU - ALU_NEG: - DST = (u32) -DST; - CONT; - ALU64_NEG: - DST = -DST; - CONT; - ALU_MOV_X: - DST = (u32) SRC; - CONT; - ALU_MOV_K: - DST = (u32) IMM; - CONT; - ALU64_MOV_X: - DST = SRC; - CONT; - ALU64_MOV_K: - DST = IMM; - CONT; - ALU64_ARSH_X: - (*(s64 *) &DST) >>= SRC; - CONT; - ALU64_ARSH_K: - (*(s64 *) &DST) >>= IMM; - CONT; - ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = DST; - DST = do_div(tmp, SRC); - CONT; - ALU_MOD_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); - CONT; - ALU64_MOD_K: - tmp = DST; - DST = do_div(tmp, IMM); - CONT; - ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); - CONT; - ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; - do_div(DST, SRC); - CONT; - ALU_DIV_X: - if (unlikely(SRC == 0)) - return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; - CONT; - ALU64_DIV_K: - do_div(DST, IMM); - CONT; - ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; - CONT; - ALU_END_TO_BE: - switch 
(IMM) { - case 16: - DST = (__force u16) cpu_to_be16(DST); - break; - case 32: - DST = (__force u32) cpu_to_be32(DST); - break; - case 64: - DST = (__force u64) cpu_to_be64(DST); - break; - } - CONT; - ALU_END_TO_LE: - switch (IMM) { - case 16: - DST = (__force u16) cpu_to_le16(DST); - break; - case 32: - DST = (__force u32) cpu_to_le32(DST); - break; - case 64: - DST = (__force u64) cpu_to_le64(DST); - break; - } - CONT; - - /* CALL */ - JMP_CALL: - /* Function call scratches BPF_R1-BPF_R5 registers, - * preserves BPF_R6-BPF_R9, and stores return value - * into BPF_R0. - */ - BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, - BPF_R4, BPF_R5); - CONT; - - /* JMP */ - JMP_JA: - insn += insn->off; - CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_EXIT: - return BPF_R0; - - /* STX and ST and LDX*/ -#define LDST(SIZEOP, SIZE) \ - STX_MEM_##SIZEOP: \ - 
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ - CONT; \ - ST_MEM_##SIZEOP: \ - *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ - CONT; \ - LDX_MEM_##SIZEOP: \ - DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ - CONT; - - LDST(B, u8) - LDST(H, u16) - LDST(W, u32) - LDST(DW, u64) -#undef LDST - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, sk_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. 
- * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; - - default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); - return 0; -} - -/* Helper to find the offset of pkt_type in sk_buff structure. We want - * to make sure its still a 3bit field starting at a byte boundary; - * taken from arch/x86/net/bpf_jit_comp.c. 
- */ -#ifdef __BIG_ENDIAN_BITFIELD -#define PKT_TYPE_MAX (7 << 5) -#else -#define PKT_TYPE_MAX 7 -#endif -static unsigned int pkt_type_offset(void) -{ - struct sk_buff skb_probe = { .pkt_type = ~0, }; - u8 *ct = (u8 *) &skb_probe; - unsigned int off; - - for (off = 0; off < sizeof(struct sk_buff); off++) { - if (ct[off] == PKT_TYPE_MAX) - return off; - } - - pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); - return -1; -} - static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { - return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff((struct sk_buff *)(unsigned long) ctx); } static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) @@ -667,9 +150,9 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) } static bool convert_bpf_extensions(struct sock_filter *fp, - struct sock_filter_int **insnp) + struct bpf_insn **insnp) { - struct sock_filter_int *insn = *insnp; + struct bpf_insn *insn = *insnp; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -683,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - pkt_type_offset()); - if (insn->off < 0) - return false; - insn++; + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, + PKT_TYPE_OFFSET()); *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD insn++; @@ -805,7 +285,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, } /** - * sk_convert_filter - convert filter program + * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: buffer where converted program will be stored @@ -815,12 +295,12 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * Conversion workflow: * * 1) First pass for calculating the new program length: - * sk_convert_filter(old_prog, 
old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); - * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); * * User BPF's register A is mapped to our BPF register 6, user BPF * register X is mapped to BPF register 7; frame pointer is always @@ -828,11 +308,11 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int sk_convert_filter(struct sock_filter *prog, int len, - struct sock_filter_int *new_prog, int *new_len) +int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; - struct sock_filter_int *new_insn; + struct bpf_insn *new_insn; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -858,8 +338,8 @@ do_pass: new_insn++; for (i = 0; i < len; fp++, i++) { - struct sock_filter_int tmp_insns[6] = { }; - struct sock_filter_int *insn = tmp_insns; + struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - new_prog; @@ -1086,15 +566,12 @@ err: /* Security: * - * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). - * * As we dont want to clear mem[] array for each packet going through - * sk_run_filter(), we check that filter loaded by user never try to read + * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
*/ -static int check_load_and_stores(struct sock_filter *filter, int flen) +static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; @@ -1214,7 +691,7 @@ static bool chk_code_allowed(u16 code_to_probe) } /** - * sk_chk_filter - verify socket filter code + * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * @@ -1227,7 +704,7 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; @@ -1237,7 +714,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { - struct sock_filter *ftest = &filter[pc]; + const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? 
*/ if (!chk_code_allowed(ftest->code)) @@ -1301,12 +778,12 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(bpf_check_classic); -static int sk_store_orig_filter(struct sk_filter *fp, - const struct sock_fprog *fprog) +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); + unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); @@ -1324,7 +801,7 @@ static int sk_store_orig_filter(struct sk_filter *fp, return 0; } -static void sk_release_orig_filter(struct sk_filter *fp) +static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; @@ -1334,6 +811,18 @@ static void sk_release_orig_filter(struct sk_filter *fp) } } +static void __bpf_prog_release(struct bpf_prog *prog) +{ + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + +static void __sk_filter_release(struct sk_filter *fp) +{ + __bpf_prog_release(fp->prog); + kfree(fp); +} + /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free @@ -1342,8 +831,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - sk_release_orig_filter(fp); - sk_filter_free(fp); + __sk_filter_release(fp); } /** @@ -1360,44 +848,33 @@ static void sk_filter_release(struct sk_filter *fp) void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); - sk_filter_release(fp); -} + u32 filter_size = bpf_prog_size(fp->prog->len); -void sk_filter_charge(struct sock *sk, struct sk_filter *fp) -{ - atomic_inc(&fp->refcnt); - atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); + atomic_sub(filter_size, &sk->sk_omem_alloc); + sk_filter_release(fp); } 
-static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, - struct sock *sk, - unsigned int len) +/* try to charge the socket memory if there is space available + * return true on success + */ +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - struct sk_filter *fp_new; - - if (sk == NULL) - return krealloc(fp, len, GFP_KERNEL); - - fp_new = sock_kmalloc(sk, len, GFP_KERNEL); - if (fp_new) { - *fp_new = *fp; - /* As we're keeping orig_prog in fp_new along, - * we need to make sure we're not evicting it - * from the old fp. - */ - fp->orig_prog = NULL; - sk_filter_uncharge(sk, fp); + u32 filter_size = bpf_prog_size(fp->prog->len); + + /* same check as in sock_kmalloc() */ + if (filter_size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + atomic_inc(&fp->refcnt); + atomic_add(filter_size, &sk->sk_omem_alloc); + return true; } - - return fp_new; + return false; } -static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, - struct sock *sk) +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; - struct sk_filter *old_fp; + struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it @@ -1406,7 +883,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, * representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != - sizeof(struct sock_filter_int)); + sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd @@ -1420,13 +897,13 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, } /* 1st pass: calculate the new program length. */ - err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. 
*/ old_fp = fp; - fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -1438,17 +915,17 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, fp->len = new_len; - /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ - err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ + err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); if (err) - /* 2nd sk_convert_filter() can fail only if it fails + /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released - * by __sk_migrate_realloc(). + * by krealloc(). */ goto out_err_free; - sk_filter_select_runtime(fp); + bpf_prog_select_runtime(fp); kfree(old_prog); return fp; @@ -1456,55 +933,20 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, out_err_free: kfree(old_prog); out_err: - /* Rollback filter setup. 
*/ - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } -void __weak bpf_int_jit_compile(struct sk_filter *prog) -{ -} - -/** - * sk_filter_select_runtime - select execution runtime for BPF program - * @fp: sk_filter populated with internal BPF program - * - * try to JIT internal BPF program, if JIT is not available select interpreter - * BPF program will be executed via SK_RUN_FILTER() macro - */ -void sk_filter_select_runtime(struct sk_filter *fp) -{ - fp->bpf_func = (void *) __sk_run_filter; - - /* Probe if internal BPF can be JITed */ - bpf_int_jit_compile(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_select_runtime); - -/* free internal BPF program */ -void sk_filter_free(struct sk_filter *fp) -{ - bpf_jit_free(fp); -} -EXPORT_SYMBOL_GPL(sk_filter_free); - -static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, - struct sock *sk) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) { int err; fp->bpf_func = NULL; - fp->jited = 0; + fp->jited = false; - err = sk_chk_filter(fp->insns, fp->len); + err = bpf_check_classic(fp->insns, fp->len); if (err) { - if (sk != NULL) - sk_filter_uncharge(sk, fp); - else - kfree(fp); + __bpf_prog_release(fp); return ERR_PTR(err); } @@ -1517,13 +959,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, * internal BPF translation for the optimized interpreter. */ if (!fp->jited) - fp = __sk_migrate_filter(fp, sk); + fp = bpf_migrate_filter(fp); return fp; } /** - * sk_unattached_filter_create - create an unattached filter + * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * @@ -1532,23 +974,21 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. 
*/ -int sk_unattached_filter_create(struct sk_filter **pfp, - struct sock_fprog_kern *fprog) +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { - unsigned int fsize = sk_filter_proglen(fprog); - struct sk_filter *fp; + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); - atomic_set(&fp->refcnt, 1); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold @@ -1556,23 +996,23 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - /* __sk_prepare_filter() already takes care of uncharging + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp, NULL); + fp = bpf_prepare_filter(fp); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } -EXPORT_SYMBOL_GPL(sk_unattached_filter_create); +EXPORT_SYMBOL_GPL(bpf_prog_create); -void sk_unattached_filter_destroy(struct sk_filter *fp) +void bpf_prog_destroy(struct bpf_prog *fp) { - sk_filter_release(fp); + __bpf_prog_release(fp); } -EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); +EXPORT_SYMBOL_GPL(bpf_prog_destroy); /** * sk_attach_filter - attach a socket filter @@ -1587,8 +1027,9 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; - unsigned int fsize = sk_filter_proglen(fprog); - unsigned int sk_fsize = sk_filter_size(fprog->len); + unsigned int fsize = bpf_classic_proglen(fprog); + unsigned int bpf_fsize = bpf_prog_size(fprog->len); + struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -1598,30 +1039,43 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); - if (!fp) + prog = bpf_prog_alloc(bpf_fsize, 0); + if (!prog) return -ENOMEM; - if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, sk_fsize); + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); return -EFAULT; } - atomic_set(&fp->refcnt, 1); - fp->len = fprog->len; + prog->len = fprog->len; - err = sk_store_orig_filter(fp, fprog); + err = bpf_prog_store_orig_filter(prog, fprog); if (err) { - sk_filter_uncharge(sk, fp); + __bpf_prog_free(prog); return -ENOMEM; } - /* __sk_prepare_filter() already takes care of uncharging + /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
*/ - fp = __sk_prepare_filter(fp, sk); - if (IS_ERR(fp)) - return PTR_ERR(fp); + prog = bpf_prepare_filter(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + __bpf_prog_release(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); @@ -1670,7 +1124,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. */ - fprog = filter->orig_prog; + fprog = filter->prog->orig_prog; ret = fprog->len; if (!len) @@ -1682,7 +1136,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog))) + if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 107ed12a5323..45084938c403 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <net/flow_keys.h> +#include <scsi/fc/fc_fcoe.h> /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; @@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i } /** - * skb_flow_get_ports - extract the upper layer ports and return them - * @skb: buffer to extract the ports from + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header 
length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) { int poff = proto_ports_offset(ip_proto); + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + if (poff >= 0) { __be32 *ports, _ports; - ports = skb_header_pointer(skb, thoff + poff, - sizeof(_ports), &_ports); + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } -EXPORT_SYMBOL(skb_flow_get_ports); +EXPORT_SYMBOL(__skb_flow_get_ports); -bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the struct flow_keys from either the skbuff + * or a raw buffer specified by the rest parameters + */ +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen) { - int nhoff = skb_network_offset(skb); u8 ip_proto; - __be16 proto = skb->protocol; + + if (!data) { + data = skb->data; + proto = skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); + } memset(flow, 0, sizeof(*flow)); @@ -65,7 +91,7 @@ again: const struct iphdr *iph; struct iphdr _iph; ip: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = 
__skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) return false; nhoff += iph->ihl * 4; @@ -74,21 +100,50 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; + /* skip the address processing if skb is NULL. The assumption + * here is that if there is no skb we are not looking for flow + * info but lengths and protocols. + */ + if (!skb) + break; + iph_to_flow_copy_addrs(flow, iph); break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; + __be32 flow_label; + ipv6: - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) return false; ip_proto = iph->nexthdr; + nhoff += sizeof(struct ipv6hdr); + + /* see comment above in IPv4 section */ + if (!skb) + break; + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); - nhoff += sizeof(struct ipv6hdr); + + flow_label = ip6_flowlabel(iph); + if (flow_label) { + /* Awesome, IPv6 packet has a flow label so we can + * use that to represent the ports without any + * further dissection. 
+ */ + flow->n_proto = proto; + flow->ip_proto = ip_proto; + flow->ports = flow_label; + flow->thoff = (u16)nhoff; + + return true; + } + break; } case htons(ETH_P_8021AD): @@ -96,7 +151,7 @@ ipv6: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; - vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) return false; @@ -109,7 +164,7 @@ ipv6: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; proto = hdr->proto; @@ -123,6 +178,9 @@ ipv6: return false; } } + case htons(ETH_P_FCOE): + flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + /* fall through */ default: return false; } @@ -134,7 +192,7 @@ ipv6: __be16 proto; } *hdr, _hdr; - hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; /* @@ -154,8 +212,9 @@ ipv6: const struct ethhdr *eth; struct ethhdr _eth; - eth = skb_header_pointer(skb, nhoff, - sizeof(_eth), &_eth); + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); if (!eth) return false; proto = eth->h_proto; @@ -175,13 +234,18 @@ ipv6: break; } + flow->n_proto = proto; flow->ip_proto = ip_proto; - flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); flow->thoff = (u16) nhoff; + /* unless skb is set we don't need to record port info */ + if (skb) + flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + return true; } -EXPORT_SYMBOL(skb_flow_dissect); +EXPORT_SYMBOL(__skb_flow_dissect); static u32 hashrnd __read_mostly; static __always_inline void __flow_hash_secret_init(void) @@ -195,11 +259,32 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -static __always_inline u32 __flow_hash_1word(u32 a) 
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys) { - __flow_hash_secret_init(); - return jhash_1word(a, hashrnd); + u32 hash; + + /* get a consistent hash (same value on both flow directions) */ + if (((__force u32)keys->dst < (__force u32)keys->src) || + (((__force u32)keys->dst == (__force u32)keys->src) && + ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { + swap(keys->dst, keys->src); + swap(keys->port16[0], keys->port16[1]); + } + + hash = __flow_hash_3words((__force u32)keys->dst, + (__force u32)keys->src, + (__force u32)keys->ports); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + return __flow_hash_from_keys(keys); } +EXPORT_SYMBOL(flow_hash_from_keys); /* * __skb_get_hash: calculate a flow hash based on src/dst addresses @@ -210,7 +295,6 @@ static __always_inline u32 __flow_hash_1word(u32 a) void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; if (!skb_flow_dissect(skb, &keys)) return; @@ -218,21 +302,9 @@ void __skb_get_hash(struct sk_buff *skb) if (keys.ports) skb->l4_hash = 1; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys.dst < (__force u32)keys.src) || - (((__force u32)keys.dst == (__force u32)keys.src) && - ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { - swap(keys.dst, keys.src); - swap(keys.port16[0], keys.port16[1]); - } - - hash = __flow_hash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports); - if (!hash) - hash = 1; + skb->sw_hash = 1; - skb->hash = hash; + skb->hash = __flow_hash_from_keys(&keys); } EXPORT_SYMBOL(__skb_get_hash); @@ -240,7 +312,7 @@ EXPORT_SYMBOL(__skb_get_hash); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. 
*/ -u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; @@ -260,40 +332,27 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, qcount = dev->tc_to_txq[tc].count; } - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol; - hash = __flow_hash_1word(hash); - - return (u16) (((u64) hash * qcount) >> 32) + qoffset; + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); -/* __skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically - * truncate packets without needing to push actual payload to the user - * space and can analyze headers only, instead. - */ -u32 __skb_get_poff(const struct sk_buff *skb) +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen) { - struct flow_keys keys; - u32 poff = 0; + u32 poff = keys->thoff; - if (!skb_flow_dissect(skb, &keys)) - return 0; - - poff += keys.thoff; - switch (keys.ip_proto) { + switch (keys->ip_proto) { case IPPROTO_TCP: { - const struct tcphdr *tcph; - struct tcphdr _tcph; + /* access doff as u8 to avoid unaligned access */ + const u8 *doff; + u8 _doff; - tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); - if (!tcph) + doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), + data, hlen, &_doff); + if (!doff) return poff; - poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: @@ -323,6 +382,21 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } +/* skb_get_poff() returns the offset to the payload as far as it could + * be dissected. 
The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -338,17 +412,9 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) if (map) { if (map->len == 1) queue_index = map->queues[0]; - else { - u32 hash; - if (skb->sk && skb->sk->sk_hash) - hash = skb->sk->sk_hash; - else - hash = (__force u16) skb->protocol ^ - skb->hash; - hash = __flow_hash_1word(hash); - queue_index = map->queues[ - ((u64)hash * map->len) >> 32]; - } + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 6b5b6e7013ca..9dfb88a933e7 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -91,6 +91,8 @@ struct gen_estimator u32 avpps; struct rcu_head e_rcu; struct rb_node node; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct rcu_head head; }; struct gen_estimator_head @@ -115,9 +117,8 @@ static void est_timer(unsigned long arg) rcu_read_lock(); list_for_each_entry_rcu(e, &elist[idx].list, list) { - u64 nbytes; + struct gnet_stats_basic_packed b = {0}; u64 brate; - u32 npackets; u32 rate; spin_lock(e->stats_lock); @@ -125,15 +126,15 @@ static void est_timer(unsigned long arg) if (e->bstats == NULL) goto skip; - nbytes = e->bstats->bytes; - npackets = e->bstats->packets; - brate = (nbytes - e->last_bytes)<<(7 - idx); - e->last_bytes = nbytes; + __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats); + + brate = (b.bytes - e->last_bytes)<<(7 - idx); + 
e->last_bytes = b.bytes; e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; - rate = (npackets - e->last_packets)<<(12 - idx); - e->last_packets = npackets; + rate = (b.packets - e->last_packets)<<(12 - idx); + e->last_packets = b.packets; e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: @@ -197,18 +198,20 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * as destination. A new timer with the interval specified in the * configuration TLV is created. Upon each interval, the latest statistics * will be read from &bstats and the estimated rate will be stored in - * &rate_est with the statistics lock grabed during this period. + * &rate_est with the statistics lock grabbed during this period. * * Returns 0 on success or a negative error code. * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); + struct gnet_stats_basic_packed b = {0}; int idx; if (nla_len(opt) < sizeof(*parm)) @@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (est == NULL) return -ENOBUFS; + __gnet_stats_copy_basic(&b, cpu_bstats, bstats); + idx = parm->interval + 2; est->bstats = bstats; est->rate_est = rate_est; est->stats_lock = stats_lock; est->ewma_log = parm->ewma_log; - est->last_bytes = bstats->bytes; + est->last_bytes = b.bytes; est->avbps = rate_est->bps<<5; - est->last_packets = bstats->packets; + est->last_packets = b.packets; est->avpps = rate_est->pps<<10; + est->cpu_bstats = cpu_bstats; spin_lock_bh(&est_tree_lock); if (!elist[idx].timer.function) { @@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. 
*/ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, rate_est, stats_lock, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt); } EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 9d3d9e78397b..0c08062d1796 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); +static void +__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu) +{ + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes; + u32 packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = bcpu->bstats.bytes; + packets = bcpu->bstats.packets; + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + bstats->bytes += bytes; + bstats->packets += packets; + } +} + +void +__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + if (cpu) { + __gnet_stats_copy_basic_cpu(bstats, cpu); + } else { + bstats->bytes = b->bytes; + bstats->packets = b->packets; + } +} +EXPORT_SYMBOL(__gnet_stats_copy_basic); + /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV * @d: dumping handle @@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy); * if the room in the socket buffer was not sufficient. 
*/ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) { + struct gnet_stats_basic_packed bstats = {0}; + + __gnet_stats_copy_basic(&bstats, cpu, b); + if (d->compat_tc_stats) { - d->tc_stats.bytes = b->bytes; - d->tc_stats.packets = b->packets; + d->tc_stats.bytes = bstats.bytes; + d->tc_stats.packets = bstats.packets; } if (d->tail) { struct gnet_stats_basic sb; memset(&sb, 0, sizeof(sb)); - sb.bytes = b->bytes; - sb.packets = b->packets; + sb.bytes = bstats.bytes; + sb.packets = bstats.packets; return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); } return 0; @@ -172,29 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); +static void +__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen = 0; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) +{ + if (cpu) { + __gnet_stats_copy_queue_cpu(qstats, cpu); + } else { + qstats->qlen = q->qlen; + qstats->backlog = q->backlog; + qstats->drops = q->drops; + qstats->requeues = q->requeues; + qstats->overlimits = q->overlimits; + } + + qstats->qlen = qlen; +} + /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV * @d: dumping handle + * @cpu_q: per cpu queue statistics * @q: queue statistics + * @qlen: queue length statistics * * Appends the queue statistics to the top level TLV created by - * gnet_stats_start_copy(). + * gnet_stats_start_copy(). 
Using per cpu queue statistics if + * they are available. * * Returns 0 on success or -1 with the statistic lock released * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen) { + struct gnet_stats_queue qstats = {0}; + + __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + if (d->compat_tc_stats) { - d->tc_stats.drops = q->drops; - d->tc_stats.qlen = q->qlen; - d->tc_stats.backlog = q->backlog; - d->tc_stats.overlimits = q->overlimits; + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats)); return 0; } @@ -206,7 +294,7 @@ EXPORT_SYMBOL(gnet_stats_copy_queue); * @st: application specific statistics data * @len: length of data * - * Appends the application sepecific statistics to the top level TLV created by + * Appends the application specific statistics to the top level TLV created by * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping * handle is in backward compatibility mode. 
* diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1cac29ebb05b..9dd06699b09c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = (*format)(net, buf); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); read_unlock(&dev_base_lock); return ret; @@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev, /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ -static ssize_t format_##field(const struct net_device *net, char *buf) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, net->field); \ + return sprintf(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -112,16 +112,35 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) +{ + return sprintf(buf, fmt_dec, dev->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (ndev->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); + struct net_device *ndev = to_net_dev(dev); 
ssize_t ret = -EINVAL; read_lock(&dev_base_lock); - if (dev_isalive(net)) - ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); read_unlock(&dev_base_lock); return ret; } @@ -130,18 +149,18 @@ static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct net_device *net = to_net_dev(dev); - if (dev_isalive(net)) - return sysfs_format_mac(buf, net->broadcast, net->addr_len); + struct net_device *ndev = to_net_dev(dev); + if (dev_isalive(ndev)) + return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; } static DEVICE_ATTR_RO(broadcast); -static int change_carrier(struct net_device *net, unsigned long new_carrier) +static int change_carrier(struct net_device *dev, unsigned long new_carrier) { - if (!netif_running(net)) + if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(net, (bool) new_carrier); + return dev_change_carrier(dev, (bool) new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, @@ -265,9 +284,9 @@ static DEVICE_ATTR_RO(carrier_changes); /* read-write attributes */ -static int change_mtu(struct net_device *net, unsigned long new_mtu) +static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(net, (int) new_mtu); + return dev_set_mtu(dev, (int) new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, @@ -277,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(mtu, fmt_dec); -static int change_flags(struct net_device *net, unsigned long new_flags) +static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(net, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int) new_flags); } static ssize_t flags_store(struct device *dev, struct 
device_attribute *attr, @@ -289,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - net->tx_queue_len = new_len; + dev->tx_queue_len = new_len; return 0; } @@ -344,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev, } static DEVICE_ATTR_RW(ifalias); -static int change_group(struct net_device *net, unsigned long new_group) +static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(net, (int) new_group); + dev_set_group(dev, (int) new_group); return 0; } @@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, @@ -776,20 +796,20 @@ static struct kobj_type rx_queue_ktype = { .namespace = rx_queue_namespace }; -static int rx_queue_add_kobject(struct net_device *net, int index) +static int rx_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto exit; - if (net->sysfs_rx_queue_group) { - error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto exit; } @@ -805,18 +825,18 @@ exit: #endif /* CONFIG_SYSFS */ int -net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef 
CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS - if (!net->sysfs_rx_queue_group) + if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { - error = rx_queue_add_kobject(net, i); + error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -824,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - if (net->sysfs_rx_queue_group) - sysfs_remove_group(&net->_rx[i].kobj, - net->sysfs_rx_queue_group); - kobject_put(&net->_rx[i].kobj); + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(&dev->_rx[i].kobj, + dev->sysfs_rx_queue_group); + kobject_put(&dev->_rx[i].kobj); } return error; @@ -1135,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = { .namespace = netdev_queue_namespace, }; -static int netdev_queue_add_kobject(struct net_device *net, int index) +static int netdev_queue_add_kobject(struct net_device *dev, int index) { - struct netdev_queue *queue = net->_tx + index; + struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; - kobj->kset = net->queues_kset; + kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) @@ -1164,14 +1184,14 @@ exit: #endif /* CONFIG_SYSFS */ int -netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; for (i = old_num; i < new_num; i++) { - error = netdev_queue_add_kobject(net, i); + error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; @@ -1179,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } while (--i >= new_num) { - struct netdev_queue *queue = net->_tx + i; + struct netdev_queue *queue = dev->_tx + i; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1193,25 
+1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif /* CONFIG_SYSFS */ } -static int register_queue_kobjects(struct net_device *net) +static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) return -ENOMEM; - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - error = net_rx_queue_update_kobjects(net, 0, real_rx); + error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; - error = netdev_queue_update_kobjects(net, 0, real_tx); + error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; @@ -1219,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net) return 0; error: - netdev_queue_update_kobjects(net, txq, 0); - net_rx_queue_update_kobjects(net, rxq, 0); + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); return error; } -static void remove_queue_kobjects(struct net_device *net) +static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS - real_rx = net->real_num_rx_queues; + real_rx = dev->real_num_rx_queues; #endif - real_tx = net->real_num_tx_queues; + real_tx = dev->real_num_tx_queues; - net_rx_queue_update_kobjects(net, real_rx, 0); - netdev_queue_update_kobjects(net, real_tx, 0); + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); #ifdef CONFIG_SYSFS - kset_unregister(net->queues_kset); + kset_unregister(dev->queues_kset); #endif } @@ -1329,13 +1349,13 @@ static struct class net_class = { /* Delete sysfs entries 
but hold kobject reference until after all * netdev references are gone. */ -void netdev_unregister_kobject(struct net_device * net) +void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); + struct device *dev = &(ndev->dev); kobject_get(&dev->kobj); - remove_queue_kobjects(net); + remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); @@ -1343,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net) } /* Create sysfs entries for network device. */ -int netdev_register_kobject(struct net_device *net) +int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(net->dev); - const struct attribute_group **groups = net->sysfs_groups; + struct device *dev = &(ndev->dev); + const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; - dev->platform_data = net; + dev->platform_data = ndev; dev->groups = groups; - dev_set_name(dev, "%s", net->name); + dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ @@ -1364,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (net->ieee80211_ptr) + if (ndev->ieee80211_ptr) *groups++ = &wireless_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (net->wireless_handlers) + else if (ndev->wireless_handlers) *groups++ = &wireless_group; #endif #endif @@ -1377,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; - error = register_queue_kobjects(net); + error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 85b62691f4f2..7f155175bba8 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -224,7 +224,7 @@ static void net_free(struct net *net) return; } #endif - kfree(net->gen); + 
kfree(rcu_access_pointer(net->gen)); kmem_cache_free(net_cachep, net); } @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid) tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); + task_lock(tsk); + nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); + task_unlock(tsk); } rcu_read_unlock(); return net; @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task) struct net *net = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); - rcu_read_unlock(); + task_unlock(task); return net; } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 30d903b19c62..1f2a126f4ffa 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -107,5 +107,5 @@ struct cgroup_subsys net_cls_cgrp_subsys = { .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, - .base_cftypes = ss_files, + .legacy_cftypes = ss_files, }; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e33937fb32a0..e6645b4f330a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644); static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { - const struct net_device_ops *ops = dev->netdev_ops; int status = NETDEV_TX_OK; netdev_features_t features; @@ -92,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } - status = ops->ndo_start_xmit(skb, dev); - if (status == NETDEV_TX_OK) - txq_trans_update(txq); + status = netdev_start_xmit(skb, dev, txq, false); out: return status; @@ -116,7 +113,7 @@ static void queue_process(struct work_struct *work) continue; } - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); local_irq_save(flags); 
HARD_TX_LOCK(dev, txq, smp_processor_id()); @@ -822,7 +819,8 @@ void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); - } + } else + RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2f385b9bccc0..cbd0a199bf52 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -249,7 +249,7 @@ struct cgroup_subsys net_prio_cgrp_subsys = { .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .base_cftypes = ss_files, + .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fc17a9d309ac..443256bdcddc 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,8 +69,9 @@ * for running devices in the if_list and sends packets until count is 0 it * also the thread checks the thread->control which is used for inter-process * communication. controlling process "posts" operations to the threads this - * way. The if_lock should be possible to remove when add/rem_device is merged - * into this too. + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. 
Understood by "write" @@ -201,6 +202,7 @@ #define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ #define F_NODE (1<<15) /* Node memory alloc*/ #define F_UDPCSUM (1<<16) /* Include UDP checksum */ +#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -208,7 +210,7 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ -/* If lock -- can be removed after some work */ +/* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -241,6 +243,7 @@ struct pktgen_dev { struct proc_dir_entry *entry; /* proc file */ struct pktgen_thread *pg_thread;/* the owner */ struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ int running; /* if false, the test will stop */ @@ -384,6 +387,7 @@ struct pktgen_dev { u16 queue_map_min; u16 queue_map_max; __u32 skb_priority; /* skb priority field */ + unsigned int burst; /* number of duplicated packets to burst */ int node; /* Memory node */ #ifdef CONFIG_XFRM @@ -503,7 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warning("Unknown command: %s\n", data); + pr_warn("Unknown command: %s\n", data); return count; } @@ -610,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->traffic_class) seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + if (pkt_dev->burst > 1) + seq_printf(seq, " burst: %d\n", pkt_dev->burst); + if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); @@ -636,6 +643,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->flags & F_UDPCSUM) seq_puts(seq, "UDPCSUM "); + if (pkt_dev->flags & F_NO_TIMESTAMP) + seq_puts(seq, "NO_TIMESTAMP "); + if (pkt_dev->flags & F_MPLS_RND) seq_puts(seq, 
"MPLS_RND "); @@ -802,7 +812,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\t': case ' ': goto done_str; - break; default: break; } @@ -856,14 +865,14 @@ static ssize_t pktgen_if_write(struct file *file, pg_result = &(pkt_dev->result[0]); if (count < 1) { - pr_warning("wrong command format\n"); + pr_warn("wrong command format\n"); return -EINVAL; } max = count; tmp = count_trail_chars(user_buffer, max); if (tmp < 0) { - pr_warning("illegal format\n"); + pr_warn("illegal format\n"); return tmp; } i = tmp; @@ -1119,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file, pkt_dev->dst_mac_count); return count; } + if (!strcmp(name, "burst")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->burst = value < 1 ? 1 : value; + sprintf(pg_result, "OK: burst=%d", pkt_dev->burst); + return count; + } if (!strcmp(name, "node")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1242,6 +1261,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "!UDPCSUM") == 0) pkt_dev->flags &= ~F_UDPCSUM; + else if (strcmp(f, "NO_TIMESTAMP") == 0) + pkt_dev->flags |= F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", @@ -1250,6 +1272,7 @@ static ssize_t pktgen_if_write(struct file *file, "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " + "NO_TIMESTAMP, " #ifdef CONFIG_XFRM "IPSEC, " #endif @@ -1737,14 +1760,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) seq_puts(seq, "Running: "); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); seq_puts(seq, "\nStopped: "); - list_for_each_entry(pkt_dev, &t->if_list, list) + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) if (!pkt_dev->running) seq_printf(seq, "%s ", pkt_dev->odevname); @@ -1753,7 +1776,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) else seq_puts(seq, "\nResult: NA\n"); - if_unlock(t); + rcu_read_unlock(); return 0; } @@ -1878,10 +1901,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, pkt_dev = pktgen_find_dev(t, ifname, exact); if (pkt_dev) { if (remove) { - if_lock(t); pkt_dev->removal_mark = 1; t->control |= T_REMDEV; - if_unlock(t); } break; } @@ -1931,7 +1952,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -1946,6 +1968,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d dev->name); break; } + rcu_read_unlock(); } } @@ -2047,15 +2070,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ntxq = pkt_dev->odev->real_num_tx_queues; if (ntxq <= pkt_dev->queue_map_min) { - pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues 
on %s, resetting\n", - pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_min = (ntxq ?: 1) - 1; } if (pkt_dev->queue_map_max >= ntxq) { - pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", - pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, - pkt_dev->odevname); + pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); pkt_dev->queue_map_max = (ntxq ?: 1) - 1; } @@ -2684,9 +2707,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->pgh_magic = htonl(PKTGEN_MAGIC); pgh->seq_num = htonl(pkt_dev->seq_num); - do_gettimeofday(&timestamp); - pgh->tv_sec = htonl(timestamp.tv_sec); - pgh->tv_usec = htonl(timestamp.tv_usec); + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + do_gettimeofday(&timestamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, @@ -2997,8 +3025,8 @@ static void pktgen_run(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { /* * setup odev and create initial packet. @@ -3007,18 +3035,18 @@ static void pktgen_run(struct pktgen_thread *t) if (pkt_dev->odev) { pktgen_clear_counters(pkt_dev); - pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); strcpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself!
*/ started++; } else strcpy(pkt_dev->result, "Error starting"); } - if_unlock(t); + rcu_read_unlock(); if (started) t->control &= ~(T_STOP); } @@ -3041,27 +3069,25 @@ static int thread_is_running(const struct pktgen_thread *t) { const struct pktgen_dev *pkt_dev; - list_for_each_entry(pkt_dev, &t->if_list, list) - if (pkt_dev->running) + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); return 1; + } + rcu_read_unlock(); return 0; } static int pktgen_wait_thread_run(struct pktgen_thread *t) { - if_lock(t); - while (thread_is_running(t)) { - if_unlock(t); - msleep_interruptible(100); if (signal_pending(current)) goto signal; - if_lock(t); } - if_unlock(t); return 1; signal: return 0; @@ -3161,15 +3187,15 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; if (!pkt_dev->running) { - pr_warning("interface: %s is already stopped\n", - pkt_dev->odevname); + pr_warn("interface: %s is already stopped\n", + pkt_dev->odevname); return -EINVAL; } + pkt_dev->running = 0; kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; pkt_dev->stopped_at = ktime_get(); - pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3180,9 +3206,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) { struct pktgen_dev *pkt_dev, *best = NULL; - if_lock(t); - - list_for_each_entry(pkt_dev, &t->if_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { if (!pkt_dev->running) continue; if (best == NULL) @@ -3190,7 +3215,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } - if_unlock(t); + rcu_read_unlock(); + return best; } @@ -3200,13 +3226,13 @@ static void pktgen_stop(struct pktgen_thread *t) func_enter(); - if_lock(t); + rcu_read_lock(); - list_for_each_entry(pkt_dev, &t->if_list, list) { + 
list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { pktgen_stop_device(pkt_dev); } - if_unlock(t); + rcu_read_unlock(); } /* @@ -3220,8 +3246,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) func_enter(); - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3235,8 +3259,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t) break; } - - if_unlock(t); } static void pktgen_rem_all_ifs(struct pktgen_thread *t) @@ -3248,8 +3270,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) /* Remove all devices, free mem */ - if_lock(t); - list_for_each_safe(q, n, &t->if_list) { cur = list_entry(q, struct pktgen_dev, list); @@ -3258,8 +3278,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t) pktgen_remove_device(t, cur); } - - if_unlock(t); } static void pktgen_rem_thread(struct pktgen_thread *t) @@ -3293,11 +3311,9 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) static void pktgen_xmit(struct pktgen_dev *pkt_dev) { + unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; - netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) - = odev->netdev_ops->ndo_start_xmit; struct netdev_queue *txq; - u16 queue_map; int ret; /* If device is offline, then don't send */ @@ -3335,8 +3351,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); - queue_map = skb_get_queue_mapping(pkt_dev->skb); - txq = netdev_get_tx_queue(odev, queue_map); + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3347,16 +3362,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; goto unlock; } - atomic_inc(&(pkt_dev->skb->users)); - ret = (*xmit)(pkt_dev->skb, odev); + atomic_add(burst, &pkt_dev->skb->users); + +xmit_more: + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); switch (ret) { case NETDEV_TX_OK: - txq_trans_update(txq); 
pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq)) + goto xmit_more; break; case NET_XMIT_DROP: case NET_XMIT_CN: @@ -3375,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } + if (unlikely(burst)) + atomic_sub(burst, &pkt_dev->skb->users); unlock: HARD_TX_UNLOCK(odev, txq); @@ -3407,10 +3427,10 @@ static int pktgen_thread_worker(void *arg) pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); - set_current_state(TASK_INTERRUPTIBLE); - set_freezable(); + __set_current_state(TASK_RUNNING); + while (!kthread_should_stop()) { pkt_dev = next_to_run(t); @@ -3424,8 +3444,6 @@ static int pktgen_thread_worker(void *arg) continue; } - __set_current_state(TASK_RUNNING); - if (likely(pkt_dev)) { pktgen_xmit(pkt_dev); @@ -3456,9 +3474,8 @@ static int pktgen_thread_worker(void *arg) } try_to_freeze(); - - set_current_state(TASK_INTERRUPTIBLE); } + set_current_state(TASK_INTERRUPTIBLE); pr_debug("%s stopping all device\n", t->tsk->comm); pktgen_stop(t); @@ -3485,8 +3502,8 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, struct pktgen_dev *p, *pkt_dev = NULL; size_t len = strlen(ifname); - if_lock(t); - list_for_each_entry(p, &t->if_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) if (strncmp(p->odevname, ifname, len) == 0) { if (p->odevname[len]) { if (exact || p->odevname[len] != '@') @@ -3496,7 +3513,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, break; } - if_unlock(t); + rcu_read_unlock(); pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); return pkt_dev; } @@ -3510,6 +3527,12 @@ static int add_dev_to_thread(struct pktgen_thread *t, { int rv = 0; + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU 
than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3518,9 +3541,9 @@ static int add_dev_to_thread(struct pktgen_thread *t, goto out; } - list_add(&pkt_dev->list, &t->if_list); - pkt_dev->pg_thread = t; pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); out: if_unlock(t); @@ -3570,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->svlan_p = 0; pkt_dev->svlan_cfi = 0; pkt_dev->svlan_id = 0xffff; + pkt_dev->burst = 1; pkt_dev->node = -1; err = pktgen_setup_dev(t->net, pkt_dev, ifname); @@ -3675,11 +3699,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t, struct list_head *q, *n; struct pktgen_dev *p; + if_lock(t); list_for_each_safe(q, n, &t->if_list) { p = list_entry(q, struct pktgen_dev, list); if (p == pkt_dev) - list_del(&p->list); + list_del_rcu(&p->list); } + if_unlock(t); } static int pktgen_remove_device(struct pktgen_thread *t, @@ -3688,7 +3714,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, pr_debug("remove_device pkt_dev=%p\n", pkt_dev); if (pkt_dev->running) { - pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pr_warn("WARNING: trying to remove a running interface, stopping it now\n"); pktgen_stop_device(pkt_dev); } @@ -3699,20 +3725,22 @@ static int pktgen_remove_device(struct pktgen_thread *t, pkt_dev->odev = NULL; } - /* And update the thread if_list */ - - _rem_dev_from_if_list(t, pkt_dev); - + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() */ if (pkt_dev->entry) proc_remove(pkt_dev->entry); + /* And update the thread if_list */ + _rem_dev_from_if_list(t, pkt_dev); + #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif vfree(pkt_dev->flows); if 
(pkt_dev->page) put_page(pkt_dev->page); - kfree(pkt_dev); + kfree_rcu(pkt_dev, rcu); return 0; } @@ -3812,6 +3840,7 @@ static void __exit pg_cleanup(void) { unregister_netdevice_notifier(&pktgen_notifier_block); unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ } module_init(pg_init); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index d3027a73fd4b..4eab4a94a59d 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -52,14 +52,43 @@ * test_8021q: * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? * ldh [16] ; load inner type - * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? * ldb [18] ; load payload * and #0x8 ; as we don't have ports here, test * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these * ldh [18] ; reload payload * and #0xf ; mask PTP_CLASS_VMASK - * or #0x40 ; PTP_CLASS_V2_VLAN + * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? + * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? 
+ * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE * * ; PTP over Ethernet * test_ieee1588: @@ -78,11 +107,11 @@ #include <linux/filter.h> #include <linux/ptp_classify.h> -static struct sk_filter *ptp_insns __read_mostly; +static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return SK_RUN_FILTER(ptp_insns, skb); + return BPF_PROG_RUN(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); @@ -113,16 +142,39 @@ void __init ptp_classifier_init(void) { 0x44, 0, 0, 0x00000020 }, { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, - { 0x15, 0, 9, 0x00008100 }, + { 0x15, 0, 32, 0x00008100 }, { 0x28, 0, 0, 0x00000010 }, - { 0x15, 0, 15, 0x000088f7 }, + { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x00000008 }, - { 0x15, 0, 12, 0x00000000 }, + { 0x15, 0, 35, 0x00000000 }, { 0x28, 0, 0, 0x00000012 }, { 0x54, 0, 0, 0x0000000f }, - { 0x44, 0, 0, 0x00000040 }, + { 0x44, 0, 0, 0x00000070 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000050 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000060 }, { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, { 0x15, 0, 7, 0x000088f7 }, { 0x30, 0, 0, 0x0000000e }, { 0x54, 0, 0, 0x00000008 }, @@ -137,5 +189,5 @@ void __init ptp_classifier_init(void) .len = 
ARRAY_SIZE(ptp_filter), .filter = ptp_filter, }; - BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog)); + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 467f326126e0..04db318e6218 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, unsigned int nr_table_entries) { size_t lopt_size = sizeof(struct listen_sock); - struct listen_sock *lopt; + struct listen_sock *lopt = NULL; nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); nr_table_entries = max_t(u32, nr_table_entries, 8); nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); lopt_size += nr_table_entries * sizeof(struct request_sock *); - if (lopt_size > PAGE_SIZE) + + if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + lopt = kzalloc(lopt_size, GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); + if (!lopt) lopt = vzalloc(lopt_size); - else - lopt = kzalloc(lopt_size, GFP_KERNEL); - if (lopt == NULL) + if (!lopt) return -ENOMEM; - for (lopt->max_qlen_log = 3; - (1 << lopt->max_qlen_log) < nr_table_entries; - lopt->max_qlen_log++); - get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; + lopt->max_qlen_log = ilog2(nr_table_entries); write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, void __reqsk_queue_destroy(struct request_sock_queue *queue) { - struct listen_sock *lopt; - size_t lopt_size; - - /* - * this is an error recovery path only - * no locking needed and the lopt is not NULL - */ - - lopt = queue->listen_opt; - lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); - - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + /* This is an error recovery 
path only, no locking needed */ + kvfree(queue->listen_opt); } static inline struct listen_sock *reqsk_queue_yank_listen_sk( @@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) { /* make all the listen_opt local to us */ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); - size_t lopt_size = sizeof(struct listen_sock) + - lopt->nr_table_entries * sizeof(struct request_sock *); if (lopt->qlen != 0) { unsigned int i; @@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) } WARN_ON(lopt->qlen != 0); - if (lopt_size > PAGE_SIZE) - vfree(lopt); - else - kfree(lopt); + kvfree(lopt); } /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1063996f8317..a6882686ca3a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (rtnl_link_ops_get(ops->kind)) return -EEXIST; - if (!ops->dellink) + /* The check for setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if (ops->setup && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); @@ -799,7 +804,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + - nla_total_size(sizeof(struct ifla_vf_rate))); + nla_total_size(sizeof(struct ifla_vf_rate)) + + nla_total_size(sizeof(struct ifla_vf_link_state))); return size; } else return 0; @@ -1475,9 +1481,12 @@ static int do_set_master(struct net_device *dev, int ifindex) return 0; } +#define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. 
*/ +#define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, - struct nlattr **tb, char *ifname, int modified) + struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -1496,7 +1505,7 @@ static int do_setlink(const struct sk_buff *skb, put_net(net); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { @@ -1525,7 +1534,7 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { @@ -1545,19 +1554,19 @@ static int do_setlink(const struct sk_buff *skb, kfree(sa); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); - modified = 1; + status |= DO_SETLINK_NOTIFY; } /* @@ -1569,7 +1578,7 @@ static int do_setlink(const struct sk_buff *skb, err = dev_change_name(dev, ifname); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { @@ -1577,7 +1586,7 @@ static int do_setlink(const struct sk_buff *skb, nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { @@ -1595,25 +1604,35 @@ static int do_setlink(const struct sk_buff *skb, err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; - modified = 1; + status |= DO_SETLINK_MODIFIED; } - if (tb[IFLA_TXQLEN]) - dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_TXQLEN]) { + unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); + + if 
(dev->tx_queue_len ^ value) + status |= DO_SETLINK_NOTIFY; + + dev->tx_queue_len = value; + } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + write_lock_bh(&dev_base_lock); - dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + dev->link_mode = value; write_unlock_bh(&dev_base_lock); } @@ -1628,7 +1647,7 @@ static int do_setlink(const struct sk_buff *skb, err = do_setvfinfo(dev, attr); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1658,7 +1677,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; @@ -1676,7 +1695,7 @@ static int do_setlink(const struct sk_buff *skb, err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { @@ -1693,15 +1712,20 @@ static int do_setlink(const struct sk_buff *skb, if (err < 0) goto errout; - modified = 1; + status |= DO_SETLINK_NOTIFY; } } err = 0; errout: - if (err < 0 && modified) - net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", - dev->name); + if (status & DO_SETLINK_MODIFIED) { + if (status & DO_SETLINK_NOTIFY) + netdev_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. 
Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } return err; } @@ -1777,7 +1801,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) return -ENODEV; ops = dev->rtnl_link_ops; - if (!ops) + if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); @@ -1805,7 +1829,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, - char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) + char *ifname, unsigned char name_assign_type, + const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -1823,8 +1848,8 @@ struct net_device *rtnl_create_link(struct net *net, num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, - num_tx_queues, num_rx_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, + ops->setup, num_tx_queues, num_rx_queues); if (!dev) goto err; @@ -1889,6 +1914,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + unsigned char name_assign_type = NET_NAME_USER; int err; #ifdef CONFIG_MODULES @@ -1981,7 +2007,7 @@ replay: } if (dev) { - int modified = 0; + int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; @@ -1996,7 +2022,7 @@ replay: err = ops->changelink(dev, tb, data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { @@ -2007,10 +2033,10 @@ replay: tb, slave_data); if (err < 0) return err; - modified = 1; + status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, modified); + return do_setlink(skb, dev, ifm, tb, ifname, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2038,14 +2064,19 @@ replay: return 
-EOPNOTSUPP; } - if (!ifname[0]) + if (!ops->setup) + return -EOPNOTSUPP; + + if (!ifname[0]) { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } dest_net = rtnl_link_get_net(net, tb); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - dev = rtnl_create_link(dest_net, ifname, ops, tb); + dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -2380,22 +2411,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct net_device *dev, const unsigned char *addr) { - int err = -EOPNOTSUPP; + int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { pr_info("%s: FDB only supports static addresses\n", dev->name); - return -EINVAL; + return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); - else - err = -EINVAL; return err; } @@ -2509,6 +2538,7 @@ skip: int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, + struct net_device *filter_dev, int idx) { int err; @@ -2526,28 +2556,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int idx = 0; - struct net *net = sock_net(skb->sk); struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *bdev = NULL; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct ifinfomsg *ifm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + int brport_idx = 0; + int br_idx = 0; + int idx = 0; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (dev->priv_flags & IFF_BRIDGE_PORT) { - struct net_device *br_dev; - const struct net_device_ops *ops; + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + 
ifla_policy) == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + brport_idx = ifm->ifi_index; + + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; - br_dev = netdev_master_upper_dev_get(dev); - ops = br_dev->netdev_ops; - if (ops->ndo_fdb_dump) - idx = ops->ndo_fdb_dump(skb, cb, dev, idx); + ops = br_dev->netdev_ops; + bdev = br_dev; + } + + for_each_netdev(net, dev) { + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + + bdev = dev; + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + + bdev = br_dev; + cops = ops; } + if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (cops && cops->ndo_fdb_dump) + idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + idx); + } + + idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, idx); + idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev, + idx); + + cops = NULL; } - rcu_read_unlock(); cb->args[0] = idx; return skb->len; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index ba71212f0251..51dd3193a33e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq) * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. 
(period of 274 s) */ - return seq + (ktime_to_ns(ktime_get_real()) >> 6); + return seq + (ktime_get_real_ns() >> 6); } #endif @@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; @@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, md5_transform(hash, secret); seq = hash[0] | (((u64)hash[1]) << 32); - seq += ktime_to_ns(ktime_get_real()); + seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..829d013745ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -62,6 +62,7 @@ #include <linux/scatterlist.h> #include <linux/errqueue.h> #include <linux/prefetch.h> +#include <linux/if_vlan.h> #include <net/protocol.h> #include <net/dst.h> @@ -256,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); + struct sk_buff_fclones *fclones; - kmemcheck_annotate_bitfield(child, flags1); - kmemcheck_annotate_bitfield(child, flags2); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(fclone_ref, 1); + atomic_set(&fclones->fclone_ref, 1); - child->fclone = SKB_FCLONE_UNAVAILABLE; - child->pfmemalloc = pfmemalloc; + fclones->skb2.fclone = SKB_FCLONE_FREE; + fclones->skb2.pfmemalloc = pfmemalloc; } out: return skb; @@ -359,18 +360,29 @@ refill: goto end; } nc->frag.size = PAGE_SIZE << order; -recycle: - atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); + /* Even if we own the page, we do not use atomic_set(). 
+ * This would break get_page_unless_zero() users. + */ + atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1, + &nc->frag.page->_count); nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; nc->frag.offset = 0; } if (nc->frag.offset + fragsz > nc->frag.size) { - /* avoid unnecessary locked operations if possible */ - if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || - atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) - goto recycle; - goto refill; + if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) { + if (!atomic_sub_and_test(nc->pagecnt_bias, + &nc->frag.page->_count)) + goto refill; + /* OK, page count is 0, we can safely set it */ + atomic_set(&nc->frag.page->_count, + NETDEV_PAGECNT_MAX_BIAS); + } else { + atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias, + &nc->frag.page->_count); + } + nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; + nc->frag.offset = 0; } data = page_address(nc->frag.page) + nc->frag.offset; @@ -490,32 +502,33 @@ static void skb_free_head(struct sk_buff *skb) static void skb_release_data(struct sk_buff *skb) { - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_frag_unref(skb, i); - } + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; + if (skb->cloned && + atomic_sub_return(skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1, + &shinfo->dataref)) + return; - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i]); - if (skb_has_frag_list(skb)) - skb_drop_fraglist(skb); + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; - skb_free_head(skb); + uarg = shinfo->destructor_arg; + if (uarg->callback) + uarg->callback(uarg, true); } + + if (shinfo->frag_list) + kfree_skb_list(shinfo->frag_list); + + skb_free_head(skb); } /* @@ -523,8 +536,7 @@ static void skb_release_data(struct sk_buff *skb) */ static void kfree_skbmem(struct sk_buff *skb) { - struct sk_buff *other; - atomic_t *fclone_ref; + struct sk_buff_fclones *fclones; switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: @@ -532,22 +544,28 @@ static void kfree_skbmem(struct sk_buff *skb) break; case SKB_FCLONE_ORIG: - fclone_ref = (atomic_t *) (skb + 2); - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, skb); + fclones = container_of(skb, struct sk_buff_fclones, skb1); + if (atomic_dec_and_test(&fclones->fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, fclones); break; case SKB_FCLONE_CLONE: - fclone_ref = (atomic_t *) (skb + 1); - other = skb - 1; + fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* The clone portion is available for - * fast-cloning again. + /* Warning : We must perform the atomic_dec_and_test() before + * setting skb->fclone back to SKB_FCLONE_FREE, otherwise + * skb_clone() could set clone_ref to 2 before our decrement. + * Anyway, if we are going to free the structure, no need to + * rewrite skb->fclone. 
*/ - skb->fclone = SKB_FCLONE_UNAVAILABLE; - - if (atomic_dec_and_test(fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, other); + if (atomic_dec_and_test(&fclones->fclone_ref)) { + kmem_cache_free(skbuff_fclone_cache, fclones); + } else { + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_FREE; + } break; } } @@ -565,7 +583,7 @@ static void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif /* XXX: IS this still necessary? - JHS */ @@ -673,57 +691,61 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); +/* Make sure a field is enclosed inside headers_start/headers_end section */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ + offsetof(struct sk_buff, headers_start)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ + offsetof(struct sk_buff, headers_end)); \ + static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; + /* We do not copy old->sk */ new->dev = old->dev; - new->transport_header = old->transport_header; - new->network_header = old->network_header; - new->mac_header = old->mac_header; - new->inner_protocol = old->inner_protocol; - new->inner_transport_header = old->inner_transport_header; - new->inner_network_header = old->inner_network_header; - new->inner_mac_header = old->inner_mac_header; + memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); - skb_copy_hash(new, old); - new->ooo_okay = old->ooo_okay; - new->no_fcs = old->no_fcs; - new->encapsulation = old->encapsulation; - new->encap_hdr_csum = old->encap_hdr_csum; - new->csum_valid = old->csum_valid; - new->csum_complete_sw = old->csum_complete_sw; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); - 
new->csum = old->csum; - new->ignore_df = old->ignore_df; - new->pkt_type = old->pkt_type; - new->ip_summed = old->ip_summed; - skb_copy_queue_mapping(new, old); - new->priority = old->priority; -#if IS_ENABLED(CONFIG_IP_VS) - new->ipvs_property = old->ipvs_property; + __nf_copy(new, old, false); + + /* Note : this field could be in headers_start/headers_end section + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers_start, &old->headers_start, + offsetof(struct sk_buff, headers_end) - + offsetof(struct sk_buff, headers_start)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); #endif - new->pfmemalloc = old->pfmemalloc; - new->protocol = old->protocol; - new->mark = old->mark; - new->skb_iif = old->skb_iif; - __nf_copy(new, old); #ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; + CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; + CHECK_SKB_FIELD(tc_verd); #endif #endif - new->vlan_proto = old->vlan_proto; - new->vlan_tci = old->vlan_tci; - - skb_copy_secmark(new, old); -#ifdef CONFIG_NET_RX_BUSY_POLL - new->napi_id = old->napi_id; -#endif } /* @@ -854,17 +876,22 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { - struct sk_buff *n; + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); + struct sk_buff *n = 
&fclones->skb2; if (skb_orphan_frags(skb, gfp_mask)) return NULL; - n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_UNAVAILABLE) { - atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; - atomic_inc(fclone_ref); + /* As our fastclone was free, clone_ref must be 1 at this point. + * We could use atomic_inc() here, but it is faster + * to set the final value. + */ + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -874,7 +901,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; kmemcheck_annotate_bitfield(n, flags1); - kmemcheck_annotate_bitfield(n, flags2); n->fclone = SKB_FCLONE_UNAVAILABLE; } @@ -2646,7 +2672,7 @@ EXPORT_SYMBOL(skb_prepare_seq_read); * skb_seq_read() will return the remaining part of the block. * * Note 1: The size of each block of data returned can be arbitrary, - * this limitation is the cost for zerocopy seqeuental + * this limitation is the cost for zerocopy sequential * reads of potentially non linear data. * * Note 2: Fragment lists within fragments are not implemented @@ -2780,7 +2806,7 @@ EXPORT_SYMBOL(skb_find_text); /** * skb_append_datato_frags - append the user data to a skb * @sk: sock structure - * @skb: skb structure to be appened with user data. + * @skb: skb structure to be appended with user data. 
* @getfrag: call back function to be used for getting the user data * @from: pointer to user message iov * @length: length of the iov message @@ -2976,9 +3002,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, tail = nskb; __copy_skb_header(nskb, head_skb); - nskb->mac_len = head_skb->mac_len; skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, nskb->data - tnl_hlen, @@ -3068,6 +3094,11 @@ perform_csum_check: } } while ((offset += len) < head_skb->len); + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; return segs; err: @@ -3151,6 +3182,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } + /* switch back to head shinfo */ + pinfo = skb_shinfo(p); + if (pinfo->frag_list) goto merge; if (skb_gro_len(p) != pinfo->gso_size) @@ -3178,7 +3212,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; - skb_header_release(p); + __skb_header_release(p); NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; @@ -3210,7 +3244,7 @@ merge: else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; - skb_header_release(skb); + __skb_header_release(skb); lp = p; done: @@ -3226,7 +3260,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -3236,8 +3269,7 @@ void __init skb_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", - (2*sizeof(struct sk_buff)) + - sizeof(atomic_t), + sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); @@ -3490,43 +3522,127 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 
} EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { - struct sock *sk = orig_skb->sk; - struct sock_exterr_skb *serr; - struct sk_buff *skb; - int err; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next; + int err = 0; - if (!sk) - return; + spin_lock_bh(&q->lock); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) + err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + spin_unlock_bh(&q->lock); - if (hwtstamps) { - *skb_hwtstamps(orig_skb) = - *hwtstamps; - } else { - /* - * no hardware time stamps available, - * so keep the shared tx_flags and only - * store software time stamp - */ - orig_skb->tstamp = ktime_get_real(); + sk->sk_err = err; + if (err) + sk->sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. 
+ */ +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; } - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype) +{ + struct sock_exterr_skb *serr; + int err; serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = tstype; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk->sk_protocol == IPPROTO_TCP) + serr->ee.ee_data -= sk->sk_tskey; + } err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); } + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + + sock_put(sk); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + + if (!sk) + return; + + if (hwtstamps) + *skb_hwtstamps(orig_skb) = *hwtstamps; + else + orig_skb->tstamp = ktime_get_real(); + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + __skb_complete_tx_timestamp(skb, sk, tstype); +} +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} 
EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) @@ -3543,9 +3659,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + /* take a reference to prevent skb_orphan() from freeing the socket */ + sock_hold(sk); + err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); + + sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); @@ -3846,7 +3967,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; if (len <= skb_tailroom(to)) { - BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; return true; } @@ -3959,3 +4081,133 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) return shinfo->gso_size; } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); + +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) +{ + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); + return NULL; + } + + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; + return skb; +} + +struct sk_buff *skb_vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(vlan_tx_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + goto err_free; + + vhdr = (struct vlan_hdr *)skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = skb_reorder_vlan_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + return skb; + +err_free: + 
kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_vlan_untag); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * @header_len: size of linear part + * @data_len: needed length in frags + * @max_page_order: max page order desired. + * @errcode: pointer to error code if any + * @gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + gfp_t gfp_head; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + gfp_head = gfp_mask; + if (gfp_head & __GFP_WAIT) + gfp_head |= __GFP_REPEAT; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_head); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages(gfp_mask | + __GFP_COMP | + __GFP_NOWARN | + __GFP_NORETRY, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); diff --git a/net/core/sock.c b/net/core/sock.c index 026e01f70274..b4f3ea2fce60 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -166,7 +166,7 @@ EXPORT_SYMBOL(sk_ns_capable); /** * sk_capable - Socket global capability test 
* @sk: Socket to use a capability on or through - * @cap: The global capbility to use + * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user @@ -183,7 +183,7 @@ EXPORT_SYMBOL(sk_capable); * @sk: Socket to use a capability on or through * @cap: The capability to use * - * Test to see if the opener of the socket had when the socke was created + * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of. */ @@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; - int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb->dev = NULL; skb_set_owner_r(skb, sk); - /* Cache the SKB length before we tack it onto the receive - * queue. Once it is added it no longer belongs to us and - * may be freed by other threads of control pulling packets - * from the queue. 
- */ - skb_len = skb->len; - /* we escape from rcu protected region, make sure we dont leak * a norefcounted dst */ @@ -491,7 +483,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) skb->dev = NULL; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -848,24 +840,25 @@ set_rcvbuf: ret = -EINVAL; break; } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_ESTABLISHED) { + ret = -EINVAL; + break; + } + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, - val & SOF_TIMESTAMPING_SYS_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); break; case SO_RCVLOWAT: @@ -1091,21 +1084,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - 
if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) - v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + v.val = sk->sk_tsflags; break; case SO_RCVTIMEO: @@ -1478,6 +1457,7 @@ static void sk_update_clone(const struct sock *sk, struct sock *newsk) struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; + bool is_charged = true; newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); if (newsk != NULL) { @@ -1501,9 +1481,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1522,9 +1499,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) - sk_filter_charge(newsk, filter); + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); - if (unlikely(xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; @@ -1653,18 +1634,24 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + +#ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; -#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else 
-#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); +#endif kuid_t sock_i_uid(struct sock *sk) { @@ -1772,21 +1759,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order) { - struct sk_buff *skb = NULL; - unsigned long chunk; - gfp_t gfp_mask; + struct sk_buff *skb; long timeo; int err; - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; - struct page *page; - int i; - - err = -EMSGSIZE; - if (npages > MAX_SKB_FRAGS) - goto failure; timeo = sock_sndtimeo(sk, noblock); - while (!skb) { + for (;;) { err = sock_error(sk); if (err != 0) goto failure; @@ -1795,63 +1773,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - err = -EAGAIN; - if (!timeo) - goto failure; - if (signal_pending(current)) - goto interrupted; - timeo = sock_wait_for_wmem(sk, timeo); - continue; - } - - err = -ENOBUFS; - gfp_mask = sk->sk_allocation; - if (gfp_mask & __GFP_WAIT) - gfp_mask |= __GFP_REPEAT; + if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + break; - skb = alloc_skb(header_len, gfp_mask); - if (!skb) + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) goto failure; - - skb->truesize += data_len; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages(sk->sk_allocation | - __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, - order); - if (page) - goto fill_page; - } - order--; - } - page = alloc_page(sk->sk_allocation); - if (!page) - goto failure; -fill_page: - chunk = min_t(unsigned long, data_len, - PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); - data_len -= 
chunk; - npages -= 1 << order; - } + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); } - - skb_set_owner_w(skb, sk); + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); return skb; interrupted: err = sock_intr_errno(timeo); failure: - kfree_skb(skb); *errcode = err; return NULL; } @@ -1871,16 +1813,14 @@ EXPORT_SYMBOL(sock_alloc_send_skb); * skb_page_frag_refill - check that a page_frag contains enough room * @sz: minimum size of the fragment we want to get * @pfrag: pointer to page_frag - * @prio: priority for memory allocation + * @gfp: priority for memory allocation * * Note: While this allocator tries to use high order pages, there is * no guarantee that allocations succeed. Therefore, @sz MUST be * less or equal than PAGE_SIZE. */ -bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) { - int order; - if (pfrag->page) { if (atomic_read(&pfrag->page->_count) == 1) { pfrag->offset = 0; @@ -1891,20 +1831,21 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio) put_page(pfrag->page); } - order = SKB_FRAG_PAGE_ORDER; - do { - gfp_t gfp = prio; - - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; - pfrag->page = alloc_pages(gfp, order); + pfrag->offset = 0; + if (SKB_FRAG_PAGE_ORDER) { + pfrag->page = alloc_pages(gfp | __GFP_COMP | + __GFP_NOWARN | __GFP_NORETRY, + SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { - pfrag->offset = 0; - pfrag->size = PAGE_SIZE << order; + pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; return true; } - } while (--order >= 0); - + } + pfrag->page = alloc_page(gfp); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE; + return true; + } return false; } EXPORT_SYMBOL(skb_page_frag_refill); @@ -2314,9 +2255,6 @@ void sock_init_data(struct socket *sock, struct sock 
*sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; @@ -2504,11 +2442,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; int copied, err; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -2529,16 +2467,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a4216a4c9572..ad704c757bb4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -68,8 +68,8 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, if (!filter) goto out; - fprog = filter->orig_prog; - flen = sk_filter_proglen(fprog); + fprog = filter->prog->orig_prog; + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); if (attr == NULL) { diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 6521dfd8b7c8..43d3dd62fcc8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -36,71 +36,25 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; struct sk_buff *clone; - struct sock *sk = skb->sk; unsigned int type; - if (!sk) + if (!skb->sk) return; type = classify(skb); + if (type == PTP_CLASS_NONE) + return; - switch (type) { - case 
PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->txtstamp)) { - if (!atomic_inc_not_zero(&sk->sk_refcnt)) - return; - - clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) { - sock_put(sk); - return; - } - - clone->sk = sk; - phydev->drv->txtstamp(phydev, clone, type); - } - break; - default: - break; + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + clone = skb_clone_sk(skb); + if (!clone) + return; + phydev->drv->txtstamp(phydev, clone, type); } } EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); -void skb_complete_tx_timestamp(struct sk_buff *skb, - struct skb_shared_hwtstamps *hwtstamps) -{ - struct sock *sk = skb->sk; - struct sock_exterr_skb *serr; - int err; - - if (!hwtstamps) { - sock_put(sk); - kfree_skb(skb); - return; - } - - *skb_hwtstamps(skb) = *hwtstamps; - - serr = SKB_EXT_ERR(skb); - memset(serr, 0, sizeof(*serr)); - serr->ee.ee_errno = ENOMSG; - serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - skb->sk = NULL; - - err = sock_queue_err_skb(sk, skb); - - sock_put(sk); - if (err) - kfree_skb(skb); -} -EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); - bool skb_defer_rx_timestamp(struct sk_buff *skb) { struct phy_device *phydev; @@ -114,20 +68,12 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) __skb_pull(skb, ETH_HLEN); - switch (type) { - case PTP_CLASS_V1_IPV4: - case PTP_CLASS_V1_IPV6: - case PTP_CLASS_V2_IPV4: - case PTP_CLASS_V2_IPV6: - case PTP_CLASS_V2_L2: - case PTP_CLASS_V2_VLAN: - phydev = skb->dev->phydev; - if (likely(phydev->drv->rxtstamp)) - return phydev->drv->rxtstamp(phydev, skb, type); - break; - default: - break; - } + if (type == PTP_CLASS_NONE) + return false; + + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); return false; } diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 
100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include <linux/dmaengine.h> -#include <linux/socket.h> -#include <linux/export.h> -#include <net/tcp.h> -#include <net/netdma.h> - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. 
- */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/core/utils.c b/net/core/utils.c index eed34338736c..efc76dd9dcd1 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -306,16 +306,14 @@ EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { - __be32 diff[] = { ~from, to }; if (skb->ip_summed 
!= CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_partial(diff, sizeof(diff), - ~csum_unfold(*sum))); + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), + to)); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_partial(diff, sizeof(diff), - ~skb->csum); + skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); } else if (pseudohdr) - *sum = ~csum_fold(csum_partial(diff, sizeof(diff), - csum_unfold(*sum))); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), + to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index f8b98d89c285..ca11d283bbeb 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -471,7 +471,11 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { - up = netdev->dcbnl_ops->getapp(netdev, idtype, id); + ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); + if (ret < 0) + return ret; + else + up = ret; } else { struct dcb_app app = { .selector = idtype, @@ -538,6 +542,8 @@ static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); + if (ret < 0) + return ret; } else { struct dcb_app app; app.selector = idtype; @@ -1770,7 +1776,7 @@ EXPORT_SYMBOL(dcb_getapp); * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is - * set to zero. + * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { @@ -1831,7 +1837,8 @@ EXPORT_SYMBOL(dcb_ieee_getapp_mask); * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long - * as the priorities are different. + * as the priorities are different. 
Priority is expected to be a + * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 597557254ddb..83498975165f 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -99,7 +99,7 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) kmem_cache_destroy(slab); } -static int ccid_activate(struct ccid_operations *ccid_ops) +static int __init ccid_activate(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4db3c2a1679c..ad2acfe1ca61 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = inet6_reqsk_alloc(&dccp6_request_sock_ops); + req = inet_reqsk_alloc(&dccp6_request_sock_ops); if (req == NULL) goto drop; @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index c69eb9c4fbb8..b50dc436db1f 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -55,11 +55,9 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); - tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = np->ipv6only; + tw->tw_ipv6only = sk->sk_ipv6only; } #endif /* Linkage updates. 
*/ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index de2c1e719305..5ab6627cf370 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: @@ -1082,7 +1082,7 @@ void dccp_shutdown(struct sock *sk, int how) EXPORT_SYMBOL_GPL(dccp_shutdown); -static inline int dccp_mib_init(void) +static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) @@ -1115,7 +1115,7 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0); + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; rc = -ENOBUFS; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index ae011b46c071..25733d538147 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -127,6 +127,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include <linux/stat.h> #include <linux/init.h> #include <linux/poll.h> +#include <linux/jiffies.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/dst.h> @@ -598,7 +599,7 @@ int dn_destroy_timer(struct sock *sk) if (sk->sk_socket) return 0; - if ((jiffies - scp->stamp) >= (HZ * decnet_time_wait)) { + if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) { dn_unhash_sock(sk); sock_put(sk); return 1; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 3b726f31c64c..4400da7739da 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -41,6 +41,7 @@ #include <linux/sysctl.h> 
#include <linux/notifier.h> #include <linux/slab.h> +#include <linux/jiffies.h> #include <asm/uaccess.h> #include <net/net_namespace.h> #include <net/neighbour.h> @@ -875,7 +876,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) { /* First check time since device went up */ - if ((jiffies - dn_db->uptime) < DRDELAY) + if (time_before(jiffies, dn_db->uptime + DRDELAY)) return 0; /* If there is no router, then yes... */ diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c index d9c150cc59a9..1d330fd43dc7 100644 --- a/net/decnet/dn_timer.c +++ b/net/decnet/dn_timer.c @@ -23,6 +23,7 @@ #include <linux/spinlock.h> #include <net/sock.h> #include <linux/atomic.h> +#include <linux/jiffies.h> #include <net/flow.h> #include <net/dn.h> @@ -91,7 +92,7 @@ static void dn_slow_timer(unsigned long arg) * since the last successful transmission. */ if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { - if ((jiffies - scp->stamp) >= scp->keepalive) + if (time_after_eq(jiffies, scp->stamp + scp->keepalive)) scp->keepalive_fxn(sk); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index f5eede1d6cb8..a585fd6352eb 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -12,6 +12,9 @@ config NET_DSA if NET_DSA # tagging formats +config NET_DSA_TAG_BRCM + bool + config NET_DSA_TAG_DSA bool diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 7b9fcbbeda5d..da06ed1df620 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += dsa.o slave.o # tagging formats +dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5db37cef50a9..22f34cf4cb27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ 
-10,7 +10,6 @@ */ #include <linux/list.h> -#include <linux/netdevice.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/module.h> @@ -44,7 +43,7 @@ void unregister_switch_driver(struct dsa_switch_driver *drv) EXPORT_SYMBOL_GPL(unregister_switch_driver); static struct dsa_switch_driver * -dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) +dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) { struct dsa_switch_driver *ret; struct list_head *list; @@ -59,7 +58,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) drv = list_entry(list, struct dsa_switch_driver, list); - name = drv->probe(bus, sw_addr); + name = drv->probe(host_dev, sw_addr); if (name != NULL) { ret = drv; break; @@ -76,7 +75,7 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name) /* basic switch operations **************************************************/ static struct dsa_switch * dsa_switch_setup(struct dsa_switch_tree *dst, int index, - struct device *parent, struct mii_bus *bus) + struct device *parent, struct device *host_dev) { struct dsa_chip_data *pd = dst->pd->chip + index; struct dsa_switch_driver *drv; @@ -89,7 +88,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. */ - drv = dsa_switch_probe(bus, pd->sw_addr, &name); + drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); if (drv == NULL) { printk(KERN_ERR "%s[%d]: could not detect attached switch\n", dst->master_netdev->name, index); @@ -110,8 +109,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->index = index; ds->pd = dst->pd->chip + index; ds->drv = drv; - ds->master_mii_bus = bus; - + ds->master_dev = host_dev; /* * Validate supplied switch configuration. 
@@ -144,14 +142,44 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, goto out; } + /* Make the built-in MII bus mask match the number of ports, + * switch drivers can override this later + */ + ds->phys_mii_mask = ds->phys_port_mask; + /* * If the CPU connects to this switch, set the switch tree * tagging protocol to the preferred tagging format of this * switch. */ - if (ds->dst->cpu_switch == index) - ds->dst->tag_protocol = drv->tag_protocol; + if (dst->cpu_switch == index) { + switch (drv->tag_protocol) { +#ifdef CONFIG_NET_DSA_TAG_DSA + case DSA_TAG_PROTO_DSA: + dst->rcv = dsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_EDSA + case DSA_TAG_PROTO_EDSA: + dst->rcv = edsa_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_TRAILER + case DSA_TAG_PROTO_TRAILER: + dst->rcv = trailer_netdev_ops.rcv; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case DSA_TAG_PROTO_BRCM: + dst->rcv = brcm_netdev_ops.rcv; + break; +#endif + default: + break; + } + dst->tag_protocol = drv->tag_protocol; + } /* * Do basic register setup. 
@@ -210,6 +238,51 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { } +#ifdef CONFIG_PM_SLEEP +static int dsa_switch_suspend(struct dsa_switch *ds) +{ + int i, ret = 0; + + /* Suspend slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_suspend(ds->ports[i]); + if (ret) + return ret; + } + + if (ds->drv->suspend) + ret = ds->drv->suspend(ds); + + return ret; +} + +static int dsa_switch_resume(struct dsa_switch *ds) +{ + int i, ret = 0; + + if (ds->drv->resume) + ret = ds->drv->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + for (i = 0; i < DSA_MAX_PORTS; i++) { + if (!(ds->phys_port_mask & (1 << i))) + continue; + + ret = dsa_slave_resume(ds->ports[i]); + if (ret) + return ret; + } + + return 0; +} +#endif + /* link polling *************************************************************/ static void dsa_link_poll_work(struct work_struct *ugly) @@ -256,7 +329,7 @@ static struct device *dev_find_class(struct device *parent, char *class) return device_find_child(parent, class, dev_is_class); } -static struct mii_bus *dev_to_mii_bus(struct device *dev) +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) { struct device *d; @@ -272,6 +345,7 @@ static struct mii_bus *dev_to_mii_bus(struct device *dev) return NULL; } +EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); static struct net_device *dev_to_net_device(struct device *dev) { @@ -351,8 +425,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) for (i = 0; i < pd->nr_chips; i++) { port_index = 0; while (port_index < DSA_MAX_PORTS) { - if (pd->chip[i].port_names[port_index]) - kfree(pd->chip[i].port_names[port_index]); + kfree(pd->chip[i].port_names[port_index]); port_index++; } kfree(pd->chip[i].rtable); @@ -411,7 +484,8 @@ static int dsa_of_probe(struct platform_device *pdev) chip_index++; cd = &pd->chip[chip_index]; - cd->mii_bus = &mdio_bus->dev; + cd->of_node = child; + 
cd->host_dev = &mdio_bus->dev; sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) @@ -432,6 +506,8 @@ static int dsa_of_probe(struct platform_device *pdev) if (!port_name) continue; + cd->port_dn[port_index] = port; + cd->port_names[port_index] = kstrdup(port_name, GFP_KERNEL); if (!cd->port_names[port_index]) { @@ -535,17 +611,9 @@ static int dsa_probe(struct platform_device *pdev) dst->cpu_port = -1; for (i = 0; i < pd->nr_chips; i++) { - struct mii_bus *bus; struct dsa_switch *ds; - bus = dev_to_mii_bus(pd->chip[i].mii_bus); - if (bus == NULL) { - printk(KERN_ERR "%s[%d]: no mii bus found for " - "dsa switch\n", dev->name, i); - continue; - } - - ds = dsa_switch_setup(dst, i, &pdev->dev, bus); + ds = dsa_switch_setup(dst, i, &pdev->dev, pd->chip[i].host_dev); if (IS_ERR(ds)) { printk(KERN_ERR "%s[%d]: couldn't create dsa switch " "instance (error %ld)\n", dev->name, i, @@ -609,7 +677,62 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + + if (unlikely(dst == NULL)) { + kfree_skb(skb); + return 0; + } + + return dst->rcv(skb, dev, pt, orig_dev); +} + +static struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + +#ifdef CONFIG_PM_SLEEP +static int dsa_suspend(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_suspend(ds); + } + + return ret; +} + +static int dsa_resume(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct 
dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_resume(ds); + } + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); + static const struct of_device_id dsa_of_match_table[] = { + { .compatible = "brcm,bcm7445-switch-v4.0" }, { .compatible = "marvell,dsa", }, {} }; @@ -623,6 +746,7 @@ static struct platform_driver dsa_driver = { .name = "dsa", .owner = THIS_MODULE, .of_match_table = dsa_of_match_table, + .pm = &dsa_pm_ops, }, }; @@ -634,30 +758,15 @@ static int __init dsa_init_module(void) if (rc) return rc; -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_add_pack(&dsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_add_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_add_pack(&trailer_packet_type); -#endif + dev_add_pack(&dsa_pack_type); + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_remove_pack(&trailer_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_remove_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_remove_pack(&dsa_packet_type); -#endif + dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d4cf5cc747e3..dc9756d3154c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -12,7 +12,13 @@ #define __DSA_PRIV_H #include <linux/phy.h> -#include <net/dsa.h> +#include <linux/netdevice.h> + +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; struct dsa_slave_priv { /* @@ -20,6 +26,8 @@ struct dsa_slave_priv { * switch port. 
*/ struct net_device *dev; + netdev_tx_t (*xmit)(struct sk_buff *skb, + struct net_device *dev); /* * Which switch this port is a part of, and the port index @@ -33,28 +41,35 @@ struct dsa_slave_priv { * to this port. */ struct phy_device *phy; + phy_interface_t phy_interface; + int old_link; + int old_pause; + int old_duplex; }; /* dsa.c */ extern char dsa_driver_version[]; /* slave.c */ +extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); struct net_device *dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name); +int dsa_slave_suspend(struct net_device *slave_dev); +int dsa_slave_resume(struct net_device *slave_dev); /* tag_dsa.c */ -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type dsa_packet_type; +extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type edsa_packet_type; +extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type trailer_packet_type; +extern const struct dsa_device_ops trailer_netdev_ops; + +/* tag_brcm.c */ +extern const struct dsa_device_ops brcm_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 64c5af0a10dd..8030489d9cbe 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -9,9 +9,10 @@ */ #include <linux/list.h> -#include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/phy.h> +#include <linux/of_net.h> +#include <linux/of_mdio.h> #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -19,7 +20,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_read(ds, 
addr, reg); return 0xffff; @@ -29,7 +30,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; - if (ds->phys_port_mask & (1 << addr)) + if (ds->phys_mii_mask & (1 << addr)) return ds->drv->phy_write(ds, addr, reg, val); return 0; @@ -43,7 +44,7 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", ds->index, ds->pd->sw_addr); - ds->slave_mii_bus->parent = &ds->master_mii_bus->dev; + ds->slave_mii_bus->parent = ds->master_dev; } @@ -61,6 +62,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; int err; if (!(master->flags & IFF_UP)) @@ -83,8 +85,20 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } + if (ds->drv->port_enable) { + err = ds->drv->port_enable(ds, p->port, p->phy); + if (err) + goto clear_promisc; + } + + if (p->phy) + phy_start(p->phy); + return 0; +clear_promisc: + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(master, 0); clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(master, -1); @@ -99,6 +113,10 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = p->parent->dst->master_netdev; + struct dsa_switch *ds = p->parent; + + if (p->phy) + phy_stop(p->phy); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); @@ -110,6 +128,9 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); + if (ds->drv->port_disable) + ds->drv->port_disable(ds, p->port, p->phy); + return 0; } @@ -171,6 +192,24 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +static netdev_tx_t 
dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + return p->xmit(skb, dev); +} + +static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; +} + /* ethtool operations *******************************************************/ static int @@ -282,6 +321,65 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) return -EOPNOTSUPP; } +static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->get_wol) + ds->drv->get_wol(ds, p->port, w); +} + +static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->set_wol) + ret = ds->drv->set_wol(ds, p->port, w); + + return ret; +} + +static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->set_eee) + return -EOPNOTSUPP; + + ret = ds->drv->set_eee(ds, p->port, p->phy, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_set_eee(p->phy, e); + + return ret; +} + +static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; + + if (!ds->drv->get_eee) + return -EOPNOTSUPP; + + ret = ds->drv->get_eee(ds, p->port, e); + if (ret) + return ret; + + if (p->phy) + ret = phy_ethtool_get_eee(p->phy, e); + + return ret; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -291,46 
+389,143 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, + .set_wol = dsa_slave_set_wol, + .get_wol = dsa_slave_get_wol, + .set_eee = dsa_slave_set_eee, + .get_eee = dsa_slave_get_eee, }; -#ifdef CONFIG_NET_DSA_TAG_DSA -static const struct net_device_ops dsa_netdev_ops = { +static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, - .ndo_start_xmit = dsa_xmit, + .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, }; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA -static const struct net_device_ops edsa_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = edsa_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER -static const struct net_device_ops trailer_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = trailer_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif + +static void dsa_slave_adjust_link(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned int status_changed = 0; + + if (p->old_link != p->phy->link) { + status_changed = 1; + p->old_link = p->phy->link; + } + + if (p->old_duplex != p->phy->duplex) { + status_changed = 1; + p->old_duplex = 
p->phy->duplex; + } + + if (p->old_pause != p->phy->pause) { + status_changed = 1; + p->old_pause = p->phy->pause; + } + + if (ds->drv->adjust_link && status_changed) + ds->drv->adjust_link(ds, p->port, p->phy); + + if (status_changed) + phy_print_status(p->phy); +} + +static int dsa_slave_fixed_link_update(struct net_device *dev, + struct fixed_phy_status *status) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->drv->fixed_link_update) + ds->drv->fixed_link_update(ds, p->port, status); + + return 0; +} /* slave device setup *******************************************************/ +static void dsa_slave_phy_setup(struct dsa_slave_priv *p, + struct net_device *slave_dev) +{ + struct dsa_switch *ds = p->parent; + struct dsa_chip_data *cd = ds->pd; + struct device_node *phy_dn, *port_dn; + bool phy_is_fixed = false; + u32 phy_flags = 0; + int ret; + + port_dn = cd->port_dn[p->port]; + p->phy_interface = of_get_phy_mode(port_dn); + + phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); + if (of_phy_is_fixed_link(port_dn)) { + /* In the case of a fixed PHY, the DT node associated + * to the fixed PHY is the Port DT node + */ + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + pr_err("failed to register fixed PHY\n"); + return; + } + phy_is_fixed = true; + phy_dn = port_dn; + } + + if (ds->drv->get_phy_flags) + phy_flags = ds->drv->get_phy_flags(ds, p->port); + + if (phy_dn) + p->phy = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, phy_flags, + p->phy_interface); + + if (p->phy && phy_is_fixed) + fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + + /* We could not connect to a designated PHY, so use the switch internal + * MDIO bus instead + */ + if (!p->phy) + p->phy = ds->slave_mii_bus->phy_map[p->port]; + else + pr_info("attached PHY at address %d [%s]\n", + p->phy->addr, p->phy->drv->name); +} + +int dsa_slave_suspend(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = 
netdev_priv(slave_dev); + + netif_device_detach(slave_dev); + + if (p->phy) { + phy_stop(p->phy); + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; + phy_suspend(p->phy); + } + + return 0; +} + +int dsa_slave_resume(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_device_attach(slave_dev); + + if (p->phy) { + phy_resume(p->phy); + phy_start(p->phy); + } + + return 0; +} + struct net_device * dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) @@ -340,8 +535,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, struct dsa_slave_priv *p; int ret; - slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), - name, ether_setup); + slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, + NET_NAME_UNKNOWN, ether_setup); if (slave_dev == NULL) return slave_dev; @@ -349,35 +544,48 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; + slave_dev->netdev_ops = &dsa_slave_netdev_ops; + + SET_NETDEV_DEV(slave_dev, parent); + slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->vlan_features = master->vlan_features; + + p = netdev_priv(slave_dev); + p->dev = slave_dev; + p->parent = ds; + p->port = port; switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA - case htons(ETH_P_DSA): - slave_dev->netdev_ops = &dsa_netdev_ops; + case DSA_TAG_PROTO_DSA: + p->xmit = dsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA - case htons(ETH_P_EDSA): - slave_dev->netdev_ops = &edsa_netdev_ops; + case DSA_TAG_PROTO_EDSA: + p->xmit = edsa_netdev_ops.xmit; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER - case htons(ETH_P_TRAILER): - slave_dev->netdev_ops = &trailer_netdev_ops; + case DSA_TAG_PROTO_TRAILER: + p->xmit = trailer_netdev_ops.xmit; + break; +#endif +#ifdef CONFIG_NET_DSA_TAG_BRCM + case DSA_TAG_PROTO_BRCM: + 
p->xmit = brcm_netdev_ops.xmit; break; #endif default: - BUG(); + p->xmit = dsa_slave_notag_xmit; + break; } - SET_NETDEV_DEV(slave_dev, parent); - slave_dev->vlan_features = master->vlan_features; + p->old_pause = -1; + p->old_link = -1; + p->old_duplex = -1; - p = netdev_priv(slave_dev); - p->dev = slave_dev; - p->parent = ds; - p->port = port; - p->phy = ds->slave_mii_bus->phy_map[port]; + dsa_slave_phy_setup(p, slave_dev); ret = register_netdev(slave_dev); if (ret) { @@ -390,6 +598,9 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, netif_carrier_off(slave_dev); if (p->phy != NULL) { + if (ds->drv->get_phy_flags(ds, port)) + p->phy->dev_flags |= ds->drv->get_phy_flags(ds, port); + phy_attach(slave_dev, dev_name(&p->phy->dev), PHY_INTERFACE_MODE_GMII); @@ -397,7 +608,6 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->phy->speed = 0; p->phy->duplex = 0; p->phy->advertising = p->phy->supported | ADVERTISED_Autoneg; - phy_start_aneg(p->phy); } return slave_dev; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c new file mode 100644 index 000000000000..83d3572cdb20 --- /dev/null +++ b/net/dsa/tag_brcm.c @@ -0,0 +1,171 @@ +/* + * Broadcom tag support + * + * Copyright (C) 2014 Broadcom Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/etherdevice.h> +#include <linux/list.h> +#include <linux/slab.h> +#include "dsa_priv.h" + +/* This tag length is 4 bytes, older ones were 6 bytes, we do not + * handle them + */ +#define BRCM_TAG_LEN 4 + +/* Tag is constructed and desconstructed using byte by byte access + * because the tag is placed after the MAC Source Address, which does + * not make it 4-bytes aligned, so this might cause unaligned accesses + * on most systems where this is used. 
+ */ + +/* Ingress and egress opcodes */ +#define BRCM_OPCODE_SHIFT 5 +#define BRCM_OPCODE_MASK 0x7 + +/* Ingress fields */ +/* 1st byte in the tag */ +#define BRCM_IG_TC_SHIFT 2 +#define BRCM_IG_TC_MASK 0x7 +/* 2nd byte in the tag */ +#define BRCM_IG_TE_MASK 0x3 +#define BRCM_IG_TS_SHIFT 7 +/* 3rd byte in the tag */ +#define BRCM_IG_DSTMAP2_MASK 1 +#define BRCM_IG_DSTMAP1_MASK 0xff + +/* Egress fields */ + +/* 2nd byte in the tag */ +#define BRCM_EG_CID_MASK 0xff + +/* 3rd byte in the tag */ +#define BRCM_EG_RC_MASK 0xff +#define BRCM_EG_RC_RSVD (3 << 6) +#define BRCM_EG_RC_EXCEPTION (1 << 5) +#define BRCM_EG_RC_PROT_SNOOP (1 << 4) +#define BRCM_EG_RC_PROT_TERM (1 << 3) +#define BRCM_EG_RC_SWITCH (1 << 2) +#define BRCM_EG_RC_MAC_LEARN (1 << 1) +#define BRCM_EG_RC_MIRROR (1 << 0) +#define BRCM_EG_TC_SHIFT 5 +#define BRCM_EG_TC_MASK 0x7 +#define BRCM_EG_PID_MASK 0x1f + +static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *brcm_tag; + + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; + + if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) + goto out_free; + + skb_push(skb, BRCM_TAG_LEN); + + memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN); + + /* Build the tag after the MAC Source Address */ + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Set the ingress opcode, traffic class, tag enforcment is + * deprecated + */ + brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) | + ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK); + brcm_tag[1] = 0; + brcm_tag[2] = 0; + if (p->port == 8) + brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; + brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; + + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + skb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(skb); + + return NETDEV_TX_OK; + +out_free: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int brcm_tag_rcv(struct sk_buff *skb, struct net_device 
*dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + int source_port; + u8 *brcm_tag; + + if (unlikely(dst == NULL)) + goto out_drop; + + ds = dst->ds[0]; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb == NULL) + goto out; + + if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) + goto out_drop; + + /* skb->data points to the EtherType, the tag is right before it */ + brcm_tag = skb->data - 2; + + /* The opcode should never be different than 0b000 */ + if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK)) + goto out_drop; + + /* We should never see a reserved reason code without knowing how to + * handle it + */ + WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD); + + /* Locate which port this is coming from */ + source_port = brcm_tag[3] & BRCM_EG_PID_MASK; + + /* Validate port against switch setup, either the port is totally */ + if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) + goto out_drop; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, BRCM_TAG_LEN); + + /* Move the Ethernet DA and SA */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - BRCM_TAG_LEN, + 2 * ETH_ALEN); + + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->dev = ds->ports[source_port]; + skb->protocol = eth_type_trans(skb, skb->dev); + + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; + + netif_receive_skb(skb); + + return 0; + +out_drop: + kfree_skb(skb); +out: + return 0; +} + +const struct dsa_device_ops brcm_netdev_ops = { + .xmit = brcm_tag_xmit, + .rcv = brcm_tag_rcv, +}; diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cacce1e22f9c..ce90c8bdc658 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -10,13 +10,12 @@ #include <linux/etherdevice.h> #include <linux/list.h> -#include <linux/netdevice.h> #include <linux/slab.h> #include "dsa_priv.h" #define DSA_HLEN 4 -netdev_tx_t dsa_xmit(struct sk_buff *skb, 
struct net_device *dev) +static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; @@ -186,7 +185,7 @@ out: return 0; } -struct packet_type dsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DSA), - .func = dsa_rcv, +const struct dsa_device_ops dsa_netdev_ops = { + .xmit = dsa_xmit, + .rcv = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e70c43c25e64..94fcce778679 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -10,14 +10,13 @@ #include <linux/etherdevice.h> #include <linux/list.h> -#include <linux/netdevice.h> #include <linux/slab.h> #include "dsa_priv.h" #define DSA_HLEN 4 #define EDSA_HLEN 8 -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; @@ -205,7 +204,7 @@ out: return 0; } -struct packet_type edsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_EDSA), - .func = edsa_rcv, +const struct dsa_device_ops edsa_netdev_ops = { + .xmit = edsa_xmit, + .rcv = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 94bc260d015d..115fdca34077 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -10,11 +10,10 @@ #include <linux/etherdevice.h> #include <linux/list.h> -#include <linux/netdevice.h> #include <linux/slab.h> #include "dsa_priv.h" -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; @@ -114,7 +113,7 @@ out: return 0; } -struct packet_type trailer_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TRAILER), - .func = trailer_rcv, +const struct dsa_device_ops trailer_netdev_ops = { + .xmit = trailer_xmit, + .rcv = trailer_rcv, }; diff --git a/net/ethernet/eth.c 
b/net/ethernet/eth.c index 5dc638cad2e1..33a140e15834 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -146,6 +146,33 @@ int eth_rebuild_header(struct sk_buff *skb) EXPORT_SYMBOL(eth_rebuild_header); /** + * eth_get_headlen - determine the the length of header for an ethernet frame + * @data: pointer to start of frame + * @len: total length of frame + * + * Make a best effort attempt to pull the length for all of the headers for + * a given frame in a linear buffer. + */ +u32 eth_get_headlen(void *data, unsigned int len) +{ + const struct ethhdr *eth = (const struct ethhdr *)data; + struct flow_keys keys; + + /* this should never happen, but better safe than sorry */ + if (len < sizeof(*eth)) + return len; + + /* parse any remaining L2/L3 headers, check for L4 */ + if (!__skb_flow_dissect(NULL, &keys, data, + eth->h_proto, sizeof(*eth), len)) + return max_t(u32, keys.thoff, sizeof(*eth)); + + /* parse for any L4 headers */ + return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); +} +EXPORT_SYMBOL(eth_get_headlen); + +/** * eth_type_trans - determine the packet's protocol ID. * @skb: received socket data * @dev: receiving network device @@ -181,11 +208,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. 
*/ - if (unlikely(netdev_uses_dsa_tags(dev))) - return htons(ETH_P_DSA); - - if (unlikely(netdev_uses_trailer_tags(dev))) - return htons(ETH_P_TRAILER); + if (unlikely(netdev_uses_dsa(dev))) + return htons(ETH_P_XDSA); if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; @@ -390,7 +414,8 @@ EXPORT_SYMBOL(ether_setup); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { - return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs); + return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN, + ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); diff --git a/net/hsr/Makefile b/net/hsr/Makefile index b68359f181cc..9ae972a820f4 100644 --- a/net/hsr/Makefile +++ b/net/hsr/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_HSR) += hsr.o -hsr-y := hsr_main.o hsr_framereg.o hsr_device.o hsr_netlink.o +hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \ + hsr_netlink.o hsr_slave.o hsr_forward.o diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e5302b7f7ca9..a138d75751df 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * This file contains device methods for creating, using and destroying * virtual HSR devices. 
@@ -15,12 +15,13 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> -#include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include "hsr_device.h" +#include "hsr_slave.h" #include "hsr_framereg.h" #include "hsr_main.h" +#include "hsr_forward.h" static bool is_admin_up(struct net_device *dev) @@ -45,75 +46,108 @@ static void __hsr_set_operstate(struct net_device *dev, int transition) } } -void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2) +static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(hsr_dev)) { - __hsr_set_operstate(hsr_dev, IF_OPER_DOWN); + if (!is_admin_up(master->dev)) { + __hsr_set_operstate(master->dev, IF_OPER_DOWN); return; } - if (is_slave_up(slave1) || is_slave_up(slave2)) - __hsr_set_operstate(hsr_dev, IF_OPER_UP); + if (has_carrier) + __hsr_set_operstate(master->dev, IF_OPER_UP); else - __hsr_set_operstate(hsr_dev, IF_OPER_LOWERLAYERDOWN); + __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); } -void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2) +static bool hsr_check_carrier(struct hsr_port *master) { - if (is_slave_up(slave1) || is_slave_up(slave2)) - netif_carrier_on(hsr_dev); + struct hsr_port *port; + bool has_carrier; + + has_carrier = false; + + rcu_read_lock(); + hsr_for_each_port(master->hsr, port) + if ((port->type != HSR_PT_MASTER) && is_slave_up(port->dev)) { + has_carrier = true; + break; + } + rcu_read_unlock(); + + if (has_carrier) + netif_carrier_on(master->dev); else - netif_carrier_off(hsr_dev); + netif_carrier_off(master->dev); + + return has_carrier; } -void hsr_check_announce(struct net_device *hsr_dev, int old_operstate) +static void hsr_check_announce(struct net_device *hsr_dev, + unsigned char old_operstate) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; - hsr_priv = netdev_priv(hsr_dev); + hsr 
= netdev_priv(hsr_dev); if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) { /* Went up */ - hsr_priv->announce_count = 0; - hsr_priv->announce_timer.expires = jiffies + + hsr->announce_count = 0; + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); - add_timer(&hsr_priv->announce_timer); + add_timer(&hsr->announce_timer); } if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP)) /* Went down */ - del_timer(&hsr_priv->announce_timer); + del_timer(&hsr->announce_timer); } - -int hsr_get_max_mtu(struct hsr_priv *hsr_priv) +void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { - int mtu_max; - - if (hsr_priv->slave[0] && hsr_priv->slave[1]) - mtu_max = min(hsr_priv->slave[0]->mtu, hsr_priv->slave[1]->mtu); - else if (hsr_priv->slave[0]) - mtu_max = hsr_priv->slave[0]->mtu; - else if (hsr_priv->slave[1]) - mtu_max = hsr_priv->slave[1]->mtu; - else - mtu_max = HSR_TAGLEN; + struct hsr_port *master; + unsigned char old_operstate; + bool has_carrier; - return mtu_max - HSR_TAGLEN; + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + /* netif_stacked_transfer_operstate() cannot be used here since + * it doesn't set IF_OPER_LOWERLAYERDOWN (?) 
+ */ + old_operstate = master->dev->operstate; + has_carrier = hsr_check_carrier(master); + hsr_set_operstate(master, has_carrier); + hsr_check_announce(master->dev, old_operstate); } +int hsr_get_max_mtu(struct hsr_priv *hsr) +{ + unsigned int mtu_max; + struct hsr_port *port; + + mtu_max = ETH_DATA_LEN; + rcu_read_lock(); + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + mtu_max = min(port->dev->mtu, mtu_max); + rcu_read_unlock(); + + if (mtu_max < HSR_HLEN) + return 0; + return mtu_max - HSR_HLEN; +} + + static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *master; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - if (new_mtu > hsr_get_max_mtu(hsr_priv)) { - netdev_info(hsr_priv->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", - HSR_TAGLEN); + if (new_mtu > hsr_get_max_mtu(hsr)) { + netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + HSR_HLEN); return -EINVAL; } @@ -124,164 +158,95 @@ static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) static int hsr_dev_open(struct net_device *dev) { - struct hsr_priv *hsr_priv; - int i; - char *slave_name; + struct hsr_priv *hsr; + struct hsr_port *port; + char designation; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); + designation = '\0'; - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if (hsr_priv->slave[i]) - slave_name = hsr_priv->slave[i]->name; - else - slave_name = "null"; - - if (!is_slave_up(hsr_priv->slave[i])) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a working HSR network\n", - 'A' + i, slave_name); + rcu_read_lock(); + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER) + continue; + switch (port->type) { + case HSR_PT_SLAVE_A: + 
designation = 'A'; + break; + case HSR_PT_SLAVE_B: + designation = 'B'; + break; + default: + designation = '?'; + } + if (!is_slave_up(port->dev)) + netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + designation, port->dev->name); } + rcu_read_unlock(); + + if (designation == '\0') + netdev_warn(dev, "No slave devices configured\n"); return 0; } + static int hsr_dev_close(struct net_device *dev) { - /* Nothing to do here. We could try to restore the state of the slaves - * to what they were before being changed by the hsr master dev's state, - * but they might have been changed manually in the mean time too, so - * taking them up or down here might be confusing and is probably not a - * good idea. - */ + /* Nothing to do here. */ return 0; } -static void hsr_fill_tag(struct hsr_ethhdr *hsr_ethhdr, struct hsr_priv *hsr_priv) +static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, + netdev_features_t features) { - unsigned long irqflags; + netdev_features_t mask; + struct hsr_port *port; - /* IEC 62439-1:2010, p 48, says the 4-bit "path" field can take values - * between 0001-1001 ("ring identifier", for regular HSR frames), - * or 1111 ("HSR management", supervision frames). Unfortunately, the - * spec writers forgot to explain what a "ring identifier" is, or - * how it is used. So we just set this to 0001 for regular frames, - * and 1111 for supervision frames. - */ - set_hsr_tag_path(&hsr_ethhdr->hsr_tag, 0x1); + mask = features; - /* IEC 62439-1:2010, p 12: "The link service data unit in an Ethernet - * frame is the content of the frame located between the Length/Type - * field and the Frame Check Sequence." + /* Mask out all features that, if supported by one device, should be + * enabled for all devices (see NETIF_F_ONE_FOR_ALL). 
* - * IEC 62439-3, p 48, specifies the "original LPDU" to include the - * original "LT" field (what "LT" means is not explained anywhere as - * far as I can see - perhaps "Length/Type"?). So LSDU_size might - * equal original length + 2. - * Also, the fact that this field is not used anywhere (might be used - * by a RedBox connecting HSR and PRP nets?) means I cannot test its - * correctness. Instead of guessing, I set this to 0 here, to make any - * problems immediately apparent. Anyone using this driver with PRP/HSR - * RedBoxes might need to fix this... + * Anything that's off in mask will not be enabled - so only things + * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL, + * may become enabled. */ - set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, 0); - - spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags); - hsr_ethhdr->hsr_tag.sequence_nr = htons(hsr_priv->sequence_nr); - hsr_priv->sequence_nr++; - spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags); + features &= ~NETIF_F_ONE_FOR_ALL; + hsr_for_each_port(hsr, port) + features = netdev_increment_features(features, + port->dev->features, + mask); - hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; - - hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP); + return features; } -static int slave_xmit(struct sk_buff *skb, struct hsr_priv *hsr_priv, - enum hsr_dev_idx dev_idx) +static netdev_features_t hsr_fix_features(struct net_device *dev, + netdev_features_t features) { - struct hsr_ethhdr *hsr_ethhdr; - - hsr_ethhdr = (struct hsr_ethhdr *) skb->data; + struct hsr_priv *hsr = netdev_priv(dev); - skb->dev = hsr_priv->slave[dev_idx]; - - hsr_addr_subst_dest(hsr_priv, &hsr_ethhdr->ethhdr, dev_idx); - - /* Address substitution (IEC62439-3 pp 26, 50): replace mac - * address of outgoing frame with that of the outgoing slave's. 
- */ - ether_addr_copy(hsr_ethhdr->ethhdr.h_source, skb->dev->dev_addr); - - return dev_queue_xmit(skb); + return hsr_features_recompute(hsr, features); } static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { - struct hsr_priv *hsr_priv; - struct hsr_ethhdr *hsr_ethhdr; - struct sk_buff *skb2; - int res1, res2; - - hsr_priv = netdev_priv(dev); - hsr_ethhdr = (struct hsr_ethhdr *) skb->data; - - if ((skb->protocol != htons(ETH_P_PRP)) || - (hsr_ethhdr->ethhdr.h_proto != htons(ETH_P_PRP))) { - hsr_fill_tag(hsr_ethhdr, hsr_priv); - skb->protocol = htons(ETH_P_PRP); - } - - skb2 = pskb_copy(skb, GFP_ATOMIC); - - res1 = NET_XMIT_DROP; - if (likely(hsr_priv->slave[HSR_DEV_SLAVE_A])) - res1 = slave_xmit(skb, hsr_priv, HSR_DEV_SLAVE_A); + struct hsr_priv *hsr = netdev_priv(dev); + struct hsr_port *master; - res2 = NET_XMIT_DROP; - if (likely(skb2 && hsr_priv->slave[HSR_DEV_SLAVE_B])) - res2 = slave_xmit(skb2, hsr_priv, HSR_DEV_SLAVE_B); - - if (likely(res1 == NET_XMIT_SUCCESS || res1 == NET_XMIT_CN || - res2 == NET_XMIT_SUCCESS || res2 == NET_XMIT_CN)) { - hsr_priv->dev->stats.tx_packets++; - hsr_priv->dev->stats.tx_bytes += skb->len; - } else { - hsr_priv->dev->stats.tx_dropped++; - } + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + skb->dev = master->dev; + hsr_forward_skb(skb, master); return NETDEV_TX_OK; } -static int hsr_header_create(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned int len) -{ - int res; - - /* Make room for the HSR tag now. We will fill it in later (in - * hsr_dev_xmit) - */ - if (skb_headroom(skb) < HSR_TAGLEN + ETH_HLEN) - return -ENOBUFS; - skb_push(skb, HSR_TAGLEN); - - /* To allow VLAN/HSR combos we should probably use - * res = dev_hard_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN); - * here instead. It would require other changes too, though - e.g. - * separate headers for each slave etc... 
- */ - res = eth_header(skb, dev, type, daddr, saddr, len + HSR_TAGLEN); - if (res <= 0) - return res; - skb_reset_mac_header(skb); - - return res + HSR_TAGLEN; -} - - static const struct header_ops hsr_header_ops = { - .create = hsr_header_create, + .create = eth_header, .parse = eth_header_parse, }; @@ -291,67 +256,63 @@ static const struct header_ops hsr_header_ops = { */ static int hsr_pad(int size) { - const int min_size = ETH_ZLEN - HSR_TAGLEN - ETH_HLEN; + const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN; if (size >= min_size) return size; return min_size; } -static void send_hsr_supervision_frame(struct net_device *hsr_dev, u8 type) +static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) { - struct hsr_priv *hsr_priv; struct sk_buff *skb; int hlen, tlen; struct hsr_sup_tag *hsr_stag; struct hsr_sup_payload *hsr_sp; unsigned long irqflags; - hlen = LL_RESERVED_SPACE(hsr_dev); - tlen = hsr_dev->needed_tailroom; + hlen = LL_RESERVED_SPACE(master->dev); + tlen = master->dev->needed_tailroom; skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen, GFP_ATOMIC); if (skb == NULL) return; - hsr_priv = netdev_priv(hsr_dev); - skb_reserve(skb, hlen); - skb->dev = hsr_dev; + skb->dev = master->dev; skb->protocol = htons(ETH_P_PRP); skb->priority = TC_PRIO_CONTROL; if (dev_hard_header(skb, skb->dev, ETH_P_PRP, - hsr_priv->sup_multicast_addr, - skb->dev->dev_addr, skb->len) < 0) + master->hsr->sup_multicast_addr, + skb->dev->dev_addr, skb->len) <= 0) goto out; + skb_reset_mac_header(skb); - skb_pull(skb, sizeof(struct ethhdr)); - hsr_stag = (typeof(hsr_stag)) skb->data; + hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag)); set_hsr_stag_path(hsr_stag, 0xf); set_hsr_stag_HSR_Ver(hsr_stag, 0); - spin_lock_irqsave(&hsr_priv->seqnr_lock, irqflags); - hsr_stag->sequence_nr = htons(hsr_priv->sequence_nr); - hsr_priv->sequence_nr++; - spin_unlock_irqrestore(&hsr_priv->seqnr_lock, irqflags); + 
spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); + master->hsr->sequence_nr++; + spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_stag->HSR_TLV_Type = type; hsr_stag->HSR_TLV_Length = 12; - skb_push(skb, sizeof(struct ethhdr)); - /* Payload: MacAddressA */ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp)); - ether_addr_copy(hsr_sp->MacAddressA, hsr_dev->dev_addr); + ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); - dev_queue_xmit(skb); + hsr_forward_skb(skb, master); return; out: + WARN_ON_ONCE("HSR: Could not send supervision frame\n"); kfree_skb(skb); } @@ -360,59 +321,32 @@ out: */ static void hsr_announce(unsigned long data) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *master; - hsr_priv = (struct hsr_priv *) data; + hsr = (struct hsr_priv *) data; - if (hsr_priv->announce_count < 3) { - send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_ANNOUNCE); - hsr_priv->announce_count++; + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + + if (hsr->announce_count < 3) { + send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE); + hsr->announce_count++; } else { - send_hsr_supervision_frame(hsr_priv->dev, HSR_TLV_LIFE_CHECK); + send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK); } - if (hsr_priv->announce_count < 3) - hsr_priv->announce_timer.expires = jiffies + + if (hsr->announce_count < 3) + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); else - hsr_priv->announce_timer.expires = jiffies + + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); - if (is_admin_up(hsr_priv->dev)) - add_timer(&hsr_priv->announce_timer); -} - - -static void restore_slaves(struct net_device *hsr_dev) -{ - struct hsr_priv *hsr_priv; - int i; - int res; - - hsr_priv = netdev_priv(hsr_dev); - - rtnl_lock(); - - /* Restore promiscuity */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if 
(!hsr_priv->slave[i]) - continue; - res = dev_set_promiscuity(hsr_priv->slave[i], -1); - if (res) - netdev_info(hsr_dev, - "Cannot restore slave promiscuity (%s, %d)\n", - hsr_priv->slave[i]->name, res); - } - - rtnl_unlock(); -} - -static void reclaim_hsr_dev(struct rcu_head *rh) -{ - struct hsr_priv *hsr_priv; + if (is_admin_up(master->dev)) + add_timer(&hsr->announce_timer); - hsr_priv = container_of(rh, struct hsr_priv, rcu_head); - free_netdev(hsr_priv->dev); + rcu_read_unlock(); } @@ -421,14 +355,18 @@ static void reclaim_hsr_dev(struct rcu_head *rh) */ static void hsr_dev_destroy(struct net_device *hsr_dev) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; - hsr_priv = netdev_priv(hsr_dev); + hsr = netdev_priv(hsr_dev); + hsr_for_each_port(hsr, port) + hsr_del_port(port); - del_timer(&hsr_priv->announce_timer); - unregister_hsr_master(hsr_priv); /* calls list_del_rcu on hsr_priv */ - restore_slaves(hsr_dev); - call_rcu(&hsr_priv->rcu_head, reclaim_hsr_dev); /* reclaim hsr_priv */ + del_timer_sync(&hsr->prune_timer); + del_timer_sync(&hsr->announce_timer); + + synchronize_rcu(); + free_netdev(hsr_dev); } static const struct net_device_ops hsr_device_ops = { @@ -436,62 +374,51 @@ static const struct net_device_ops hsr_device_ops = { .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, + .ndo_fix_features = hsr_fix_features, }; +static struct device_type hsr_type = { + .name = "hsr", +}; void hsr_dev_setup(struct net_device *dev) { random_ether_addr(dev->dev_addr); ether_setup(dev); - dev->header_ops = &hsr_header_ops; - dev->netdev_ops = &hsr_device_ops; - dev->tx_queue_len = 0; + dev->header_ops = &hsr_header_ops; + dev->netdev_ops = &hsr_device_ops; + SET_NETDEV_DEVTYPE(dev, &hsr_type); + dev->tx_queue_len = 0; dev->destructor = hsr_dev_destroy; + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | + NETIF_F_HW_VLAN_CTAG_TX; + + 
dev->features = dev->hw_features; + + /* Prevent recursive tx locking */ + dev->features |= NETIF_F_LLTX; + /* VLAN on top of HSR needs testing and probably some work on + * hsr_header_create() etc. + */ + dev->features |= NETIF_F_VLAN_CHALLENGED; + /* Not sure about this. Taken from bridge code. netdev_features.h says + * it means "Does not change network namespaces". + */ + dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise. */ -bool is_hsr_master(struct net_device *dev) +inline bool is_hsr_master(struct net_device *dev) { return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit); } -static int check_slave_ok(struct net_device *dev) -{ - /* Don't allow HSR on non-ethernet like devices */ - if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) || - (dev->addr_len != ETH_ALEN)) { - netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n"); - return -EINVAL; - } - - /* Don't allow enslaving hsr devices */ - if (is_hsr_master(dev)) { - netdev_info(dev, "Cannot create trees of HSR devices.\n"); - return -EINVAL; - } - - if (is_hsr_slave(dev)) { - netdev_info(dev, "This device is already a HSR slave.\n"); - return -EINVAL; - } - - if (dev->priv_flags & IFF_802_1Q_VLAN) { - netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); - return -EINVAL; - } - - /* HSR over bonded devices has not been tested, but I'm not sure it - * won't work... 
- */ - - return 0; -} - - /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 @@ -500,97 +427,74 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec) { - struct hsr_priv *hsr_priv; - int i; + struct hsr_priv *hsr; + struct hsr_port *port; int res; - hsr_priv = netdev_priv(hsr_dev); - hsr_priv->dev = hsr_dev; - INIT_LIST_HEAD(&hsr_priv->node_db); - INIT_LIST_HEAD(&hsr_priv->self_node_db); - for (i = 0; i < HSR_MAX_SLAVE; i++) - hsr_priv->slave[i] = slave[i]; - - spin_lock_init(&hsr_priv->seqnr_lock); - /* Overflow soon to find bugs easier: */ - hsr_priv->sequence_nr = USHRT_MAX - 1024; - - init_timer(&hsr_priv->announce_timer); - hsr_priv->announce_timer.function = hsr_announce; - hsr_priv->announce_timer.data = (unsigned long) hsr_priv; + hsr = netdev_priv(hsr_dev); + INIT_LIST_HEAD(&hsr->ports); + INIT_LIST_HEAD(&hsr->node_db); + INIT_LIST_HEAD(&hsr->self_node_db); - ether_addr_copy(hsr_priv->sup_multicast_addr, def_multicast_addr); - hsr_priv->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; + ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); -/* FIXME: should I modify the value of these? - * - * - hsr_dev->flags - i.e. - * IFF_MASTER/SLAVE? - * - hsr_dev->priv_flags - i.e. - * IFF_EBRIDGE? - * IFF_TX_SKB_SHARING? - * IFF_HSR_MASTER/SLAVE? 
- */ + /* Make sure we recognize frames from ourselves in hsr_rcv() */ + res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr, + slave[1]->dev_addr); + if (res < 0) + return res; - for (i = 0; i < HSR_MAX_SLAVE; i++) { - res = check_slave_ok(slave[i]); - if (res) - return res; - } + spin_lock_init(&hsr->seqnr_lock); + /* Overflow soon to find bugs easier: */ + hsr->sequence_nr = HSR_SEQNR_START; - hsr_dev->features = slave[0]->features & slave[1]->features; - /* Prevent recursive tx locking */ - hsr_dev->features |= NETIF_F_LLTX; - /* VLAN on top of HSR needs testing and probably some work on - * hsr_header_create() etc. - */ - hsr_dev->features |= NETIF_F_VLAN_CHALLENGED; + init_timer(&hsr->announce_timer); + hsr->announce_timer.function = hsr_announce; + hsr->announce_timer.data = (unsigned long) hsr; - /* Set hsr_dev's MAC address to that of mac_slave1 */ - ether_addr_copy(hsr_dev->dev_addr, hsr_priv->slave[0]->dev_addr); + init_timer(&hsr->prune_timer); + hsr->prune_timer.function = hsr_prune_nodes; + hsr->prune_timer.data = (unsigned long) hsr; - /* Set required header length */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - if (slave[i]->hard_header_len + HSR_TAGLEN > - hsr_dev->hard_header_len) - hsr_dev->hard_header_len = - slave[i]->hard_header_len + HSR_TAGLEN; - } + ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); + hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; - /* MTU */ - for (i = 0; i < HSR_MAX_SLAVE; i++) - if (slave[i]->mtu - HSR_TAGLEN < hsr_dev->mtu) - hsr_dev->mtu = slave[i]->mtu - HSR_TAGLEN; + /* FIXME: should I modify the value of these? + * + * - hsr_dev->flags - i.e. + * IFF_MASTER/SLAVE? + * - hsr_dev->priv_flags - i.e. + * IFF_EBRIDGE? + * IFF_TX_SKB_SHARING? + * IFF_HSR_MASTER/SLAVE? 
+ */ /* Make sure the 1st call to netif_carrier_on() gets through */ netif_carrier_off(hsr_dev); - /* Promiscuity */ - for (i = 0; i < HSR_MAX_SLAVE; i++) { - res = dev_set_promiscuity(slave[i], 1); - if (res) { - netdev_info(hsr_dev, "Cannot set slave promiscuity (%s, %d)\n", - slave[i]->name, res); - goto fail; - } - } + res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER); + if (res) + return res; - /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr_priv->self_node_db, - hsr_dev->dev_addr, - hsr_priv->slave[1]->dev_addr); - if (res < 0) + res = register_netdevice(hsr_dev); + if (res) goto fail; - res = register_netdevice(hsr_dev); + res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A); + if (res) + goto fail; + res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B); if (res) goto fail; - register_hsr_master(hsr_priv); + hsr->prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); + add_timer(&hsr->prune_timer); return 0; fail: - restore_slaves(hsr_dev); + hsr_for_each_port(hsr, port) + hsr_del_port(port); + return res; } diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 2c7148e73914..108a5d59d2a6 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. 
* * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #ifndef __HSR_DEVICE_H @@ -18,12 +18,8 @@ void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec); -void hsr_set_operstate(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2); -void hsr_set_carrier(struct net_device *hsr_dev, struct net_device *slave1, - struct net_device *slave2); -void hsr_check_announce(struct net_device *hsr_dev, int old_operstate); +void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); bool is_hsr_master(struct net_device *dev); -int hsr_get_max_mtu(struct hsr_priv *hsr_priv); +int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c new file mode 100644 index 000000000000..7871ed6d3825 --- /dev/null +++ b/net/hsr/hsr_forward.c @@ -0,0 +1,368 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#include "hsr_forward.h" +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/etherdevice.h> +#include <linux/if_vlan.h> +#include "hsr_main.h" +#include "hsr_framereg.h" + + +struct hsr_node; + +struct hsr_frame_info { + struct sk_buff *skb_std; + struct sk_buff *skb_hsr; + struct hsr_port *port_rcv; + struct hsr_node *node_src; + u16 sequence_nr; + bool is_supervision; + bool is_vlan; + bool is_local_dest; + bool is_local_exclusive; +}; + + +/* The uses I can see for these HSR supervision frames are: + * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = + * 22") to reset any sequence_nr counters belonging to that node. Useful if + * the other node's counter has been reset for some reason. + * -- + * Or not - resetting the counter and bridging the frame would create a + * loop, unfortunately. + * + * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck + * frame is received from a particular node, we know something is wrong. + * We just register these (as with normal frames) and throw them away. + * + * 3) Allow different MAC addresses for the two slave interfaces, using the + * MacAddressA field. 
+ */ +static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) +{ + struct hsr_ethhdr_sp *hdr; + + WARN_ON_ONCE(!skb_mac_header_was_set(skb)); + hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb); + + if (!ether_addr_equal(hdr->ethhdr.h_dest, + hsr->sup_multicast_addr)) + return false; + + if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f) + return false; + if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) && + (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) + return false; + if (hdr->hsr_sup.HSR_TLV_Length != 12) + return false; + + return true; +} + + +static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, + struct hsr_frame_info *frame) +{ + struct sk_buff *skb; + int copylen; + unsigned char *dst, *src; + + skb_pull(skb_in, HSR_HLEN); + skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); + skb_push(skb_in, HSR_HLEN); + if (skb == NULL) + return NULL; + + skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start -= HSR_HLEN; + + copylen = 2*ETH_ALEN; + if (frame->is_vlan) + copylen += VLAN_HLEN; + src = skb_mac_header(skb_in); + dst = skb_mac_header(skb); + memcpy(dst, src, copylen); + + skb->protocol = eth_hdr(skb)->h_proto; + return skb; +} + +static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (!frame->skb_std) + frame->skb_std = create_stripped_skb(frame->skb_hsr, frame); + return skb_clone(frame->skb_std, GFP_ATOMIC); +} + + +static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, + struct hsr_port *port) +{ + struct hsr_ethhdr *hsr_ethhdr; + int lane_id; + int lsdu_size; + + if (port->type == HSR_PT_SLAVE_A) + lane_id = 0; + else + lane_id = 1; + + lsdu_size = skb->len - 14; + if (frame->is_vlan) + lsdu_size -= 4; + + hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + + set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id); + set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); + 
hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); + hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; + hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP); +} + +static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, + struct hsr_frame_info *frame, + struct hsr_port *port) +{ + int movelen; + unsigned char *dst, *src; + struct sk_buff *skb; + + /* Create the new skb with enough headroom to fit the HSR tag */ + skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC); + if (skb == NULL) + return NULL; + skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += HSR_HLEN; + + movelen = ETH_HLEN; + if (frame->is_vlan) + movelen += VLAN_HLEN; + + src = skb_mac_header(skb); + dst = skb_push(skb, HSR_HLEN); + memmove(dst, src, movelen); + skb_reset_mac_header(skb); + + hsr_fill_tag(skb, frame, port); + + return skb; +} + +/* If the original frame was an HSR tagged frame, just clone it to be sent + * unchanged. Otherwise, create a private frame especially tagged for 'port'. 
+ */ +static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame, + struct hsr_port *port) +{ + if (frame->skb_hsr) + return skb_clone(frame->skb_hsr, GFP_ATOMIC); + + if ((port->type != HSR_PT_SLAVE_A) && (port->type != HSR_PT_SLAVE_B)) { + WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port"); + return NULL; + } + + return create_tagged_skb(frame->skb_std, frame, port); +} + + +static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, + struct hsr_node *node_src) +{ + bool was_multicast_frame; + int res; + + was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); + hsr_addr_subst_source(node_src, skb); + skb_pull(skb, ETH_HLEN); + res = netif_rx(skb); + if (res == NET_RX_DROP) { + dev->stats.rx_dropped++; + } else { + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; + if (was_multicast_frame) + dev->stats.multicast++; + } +} + +static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, + struct hsr_frame_info *frame) +{ + if (frame->port_rcv->type == HSR_PT_MASTER) { + hsr_addr_subst_dest(frame->node_src, skb, port); + + /* Address substitution (IEC62439-3 pp 26, 50): replace mac + * address of outgoing frame with that of the outgoing slave's. + */ + ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); + } + return dev_queue_xmit(skb); +} + + +/* Forward the frame through all devices except: + * - Back through the receiving device + * - If it's a HSR frame: through a device where it has passed before + * - To the local HSR master only if the frame is directly addressed to it, or + * a non-supervision multicast or broadcast frame. + * + * HSR slave devices should insert a HSR tag into the frame, or forward the + * frame unchanged if it's already tagged. Interlink devices should strip HSR + * tags if they're of the non-HSR type (but only after duplicate discard). The + * master device always strips HSR tags. 
+ */ +static void hsr_forward_do(struct hsr_frame_info *frame) +{ + struct hsr_port *port; + struct sk_buff *skb; + + hsr_for_each_port(frame->port_rcv->hsr, port) { + /* Don't send frame back the way it came */ + if (port == frame->port_rcv) + continue; + + /* Don't deliver locally unless we should */ + if ((port->type == HSR_PT_MASTER) && !frame->is_local_dest) + continue; + + /* Deliver frames directly addressed to us to master only */ + if ((port->type != HSR_PT_MASTER) && frame->is_local_exclusive) + continue; + + /* Don't send frame over port where it has been sent before */ + if (hsr_register_frame_out(port, frame->node_src, + frame->sequence_nr)) + continue; + + if (frame->is_supervision && (port->type == HSR_PT_MASTER)) { + hsr_handle_sup_frame(frame->skb_hsr, + frame->node_src, + frame->port_rcv); + continue; + } + + if (port->type != HSR_PT_MASTER) + skb = frame_get_tagged_skb(frame, port); + else + skb = frame_get_stripped_skb(frame, port); + if (skb == NULL) { + /* FIXME: Record the dropped frame? 
*/ + continue; + } + + skb->dev = port->dev; + if (port->type == HSR_PT_MASTER) + hsr_deliver_master(skb, port->dev, frame->node_src); + else + hsr_xmit(skb, port, frame); + } +} + + +static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, + struct hsr_frame_info *frame) +{ + struct net_device *master_dev; + + master_dev = hsr_port_get_hsr(hsr, HSR_PT_MASTER)->dev; + + if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) { + frame->is_local_exclusive = true; + skb->pkt_type = PACKET_HOST; + } else { + frame->is_local_exclusive = false; + } + + if ((skb->pkt_type == PACKET_HOST) || + (skb->pkt_type == PACKET_MULTICAST) || + (skb->pkt_type == PACKET_BROADCAST)) { + frame->is_local_dest = true; + } else { + frame->is_local_dest = false; + } +} + + +static int hsr_fill_frame_info(struct hsr_frame_info *frame, + struct sk_buff *skb, struct hsr_port *port) +{ + struct ethhdr *ethhdr; + unsigned long irqflags; + + frame->is_supervision = is_supervision_frame(port->hsr, skb); + frame->node_src = hsr_get_node(&port->hsr->node_db, skb, + frame->is_supervision); + if (frame->node_src == NULL) + return -1; /* Unknown node and !is_supervision, or no mem */ + + ethhdr = (struct ethhdr *) skb_mac_header(skb); + frame->is_vlan = false; + if (ethhdr->h_proto == htons(ETH_P_8021Q)) { + frame->is_vlan = true; + /* FIXME: */ + WARN_ONCE(1, "HSR: VLAN not yet supported"); + } + if (ethhdr->h_proto == htons(ETH_P_PRP)) { + frame->skb_std = NULL; + frame->skb_hsr = skb; + frame->sequence_nr = hsr_get_skb_sequence_nr(skb); + } else { + frame->skb_std = skb; + frame->skb_hsr = NULL; + /* Sequence nr for the master node */ + spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags); + frame->sequence_nr = port->hsr->sequence_nr; + port->hsr->sequence_nr++; + spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags); + } + + frame->port_rcv = port; + check_local_dest(port->hsr, skb, frame); + + return 0; +} + +/* Must be called holding rcu read lock (because of the port parameter) */ 
+void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) +{ + struct hsr_frame_info frame; + + if (skb_mac_header(skb) != skb->data) { + WARN_ONCE(1, "%s:%d: Malformed frame (port_src %s)\n", + __FILE__, __LINE__, port->dev->name); + goto out_drop; + } + + if (hsr_fill_frame_info(&frame, skb, port) < 0) + goto out_drop; + hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); + hsr_forward_do(&frame); + + if (frame.skb_hsr != NULL) + kfree_skb(frame.skb_hsr); + if (frame.skb_std != NULL) + kfree_skb(frame.skb_std); + return; + +out_drop: + port->dev->stats.tx_dropped++; + kfree_skb(skb); +} diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h new file mode 100644 index 000000000000..5c5bc4b6b75f --- /dev/null +++ b/net/hsr/hsr_forward.h @@ -0,0 +1,20 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#ifndef __HSR_FORWARD_H +#define __HSR_FORWARD_H + +#include <linux/netdevice.h> +#include "hsr_main.h" + +void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port); + +#endif /* __HSR_FORWARD_H */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 83e58449366a..bace124d14ef 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. 
* * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR @@ -23,71 +23,68 @@ #include "hsr_netlink.h" -struct node_entry { - struct list_head mac_list; - unsigned char MacAddressA[ETH_ALEN]; - unsigned char MacAddressB[ETH_ALEN]; - enum hsr_dev_idx AddrB_if; /* The local slave through which AddrB - * frames are received from this node - */ - unsigned long time_in[HSR_MAX_SLAVE]; - bool time_in_stale[HSR_MAX_SLAVE]; - u16 seq_out[HSR_MAX_DEV]; - struct rcu_head rcu_head; +struct hsr_node { + struct list_head mac_list; + unsigned char MacAddressA[ETH_ALEN]; + unsigned char MacAddressB[ETH_ALEN]; + /* Local slave through which AddrB frames are received from this node */ + enum hsr_port_type AddrB_port; + unsigned long time_in[HSR_PT_PORTS]; + bool time_in_stale[HSR_PT_PORTS]; + u16 seq_out[HSR_PT_PORTS]; + struct rcu_head rcu_head; }; -/* TODO: use hash lists for mac addresses (linux/jhash.h)? */ +/* TODO: use hash lists for mac addresses (linux/jhash.h)? */ -/* Search for mac entry. Caller must hold rcu read lock. +/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, + * false otherwise. 
*/ -static struct node_entry *find_node_by_AddrA(struct list_head *node_db, - const unsigned char addr[ETH_ALEN]) +static bool seq_nr_after(u16 a, u16 b) { - struct node_entry *node; - - list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, addr)) - return node; - } + /* Remove inconsistency where + * seq_nr_after(a, b) == seq_nr_before(a, b) + */ + if ((int) b - a == 32768) + return false; - return NULL; + return (((s16) (b - a)) < 0); } +#define seq_nr_before(a, b) seq_nr_after((b), (a)) +#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) +#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -/* Search for mac entry. Caller must hold rcu read lock. - */ -static struct node_entry *find_node_by_AddrB(struct list_head *node_db, - const unsigned char addr[ETH_ALEN]) +bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { - struct node_entry *node; + struct hsr_node *node; - list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressB, addr)) - return node; + node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node, + mac_list); + if (!node) { + WARN_ONCE(1, "HSR: No self node\n"); + return false; } - return NULL; -} + if (ether_addr_equal(addr, node->MacAddressA)) + return true; + if (ether_addr_equal(addr, node->MacAddressB)) + return true; + return false; +} /* Search for mac entry. Caller must hold rcu read lock. 
*/ -struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb) +static struct hsr_node *find_node_by_AddrA(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]) { - struct node_entry *node; - struct ethhdr *ethhdr; - - if (!skb_mac_header_was_set(skb)) - return NULL; - - ethhdr = (struct ethhdr *) skb_mac_header(skb); + struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, ethhdr->h_source)) - return node; - if (ether_addr_equal(node->MacAddressB, ethhdr->h_source)) + if (ether_addr_equal(node->MacAddressA, addr)) return node; } @@ -102,7 +99,7 @@ int hsr_create_self_node(struct list_head *self_node_db, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]) { - struct node_entry *node, *oldnode; + struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) @@ -113,7 +110,7 @@ int hsr_create_self_node(struct list_head *self_node_db, rcu_read_lock(); oldnode = list_first_or_null_rcu(self_node_db, - struct node_entry, mac_list); + struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); rcu_read_unlock(); @@ -128,135 +125,144 @@ int hsr_create_self_node(struct list_head *self_node_db, } -/* Add/merge node to the database of nodes. 'skb' must contain an HSR - * supervision frame. - * - If the supervision header's MacAddressA field is not yet in the database, - * this frame is from an hitherto unknown node - add it to the database. - * - If the sender's MAC address is not the same as its MacAddressA address, - * the node is using PICS_SUBS (address substitution). Record the sender's - * address as the node's MacAddressB. - * - * This function needs to work even if the sender node has changed one of its - * slaves' MAC addresses. In this case, there are four different cases described - * by (Addr-changed, received-from) pairs as follows. 
Note that changing the - * SlaveA address is equal to changing the node's own address: - * - * - (AddrB, SlaveB): The new AddrB will be recorded by PICS_SUBS code since - * node == NULL. - * - (AddrB, SlaveA): Will work as usual (the AddrB change won't be detected - * from this frame). - * - * - (AddrA, SlaveB): The old node will be found. We need to detect this and - * remove the node. - * - (AddrA, SlaveA): A new node will be registered (non-PICS_SUBS at first). - * The old one will be pruned after HSR_NODE_FORGET_TIME. - * - * We also need to detect if the sender's SlaveA and SlaveB cables have been - * swapped. +/* Allocate an hsr_node and add it to node_db. 'addr' is the node's AddressA; + * seq_out is used to initialize filtering of outgoing duplicate frames + * originating from the newly added node. */ -struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, - struct node_entry *node, - struct sk_buff *skb, - enum hsr_dev_idx dev_idx) +struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], + u16 seq_out) { - struct hsr_sup_payload *hsr_sp; - struct hsr_ethhdr_sp *hsr_ethsup; - int i; + struct hsr_node *node; unsigned long now; - - hsr_ethsup = (struct hsr_ethhdr_sp *) skb_mac_header(skb); - hsr_sp = (struct hsr_sup_payload *) skb->data; - - if (node && !ether_addr_equal(node->MacAddressA, hsr_sp->MacAddressA)) { - /* Node has changed its AddrA, frame was received from SlaveB */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - node = NULL; - } - - if (node && (dev_idx == node->AddrB_if) && - !ether_addr_equal(node->MacAddressB, hsr_ethsup->ethhdr.h_source)) { - /* Cables have been swapped */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - node = NULL; - } - - if (node && (dev_idx != node->AddrB_if) && - (node->AddrB_if != HSR_DEV_NONE) && - !ether_addr_equal(node->MacAddressA, hsr_ethsup->ethhdr.h_source)) { - /* Cables have been swapped */ - list_del_rcu(&node->mac_list); - kfree_rcu(node, 
rcu_head); - node = NULL; - } - - if (node) - return node; - - node = find_node_by_AddrA(&hsr_priv->node_db, hsr_sp->MacAddressA); - if (node) { - /* Node is known, but frame was received from an unknown - * address. Node is PICS_SUBS capable; merge its AddrB. - */ - ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); - node->AddrB_if = dev_idx; - return node; - } + int i; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (!node) return NULL; - ether_addr_copy(node->MacAddressA, hsr_sp->MacAddressA); - ether_addr_copy(node->MacAddressB, hsr_ethsup->ethhdr.h_source); - if (!ether_addr_equal(hsr_sp->MacAddressA, hsr_ethsup->ethhdr.h_source)) - node->AddrB_if = dev_idx; - else - node->AddrB_if = HSR_DEV_NONE; + ether_addr_copy(node->MacAddressA, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; - for (i = 0; i < HSR_MAX_SLAVE; i++) + for (i = 0; i < HSR_PT_PORTS; i++) node->time_in[i] = now; - for (i = 0; i < HSR_MAX_DEV; i++) - node->seq_out[i] = ntohs(hsr_ethsup->hsr_sup.sequence_nr) - 1; + for (i = 0; i < HSR_PT_PORTS; i++) + node->seq_out[i] = seq_out; - list_add_tail_rcu(&node->mac_list, &hsr_priv->node_db); + list_add_tail_rcu(&node->mac_list, node_db); return node; } +/* Get the hsr_node from which 'skb' was sent. 
+ */ +struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, + bool is_sup) +{ + struct hsr_node *node; + struct ethhdr *ethhdr; + u16 seq_out; + + if (!skb_mac_header_was_set(skb)) + return NULL; + + ethhdr = (struct ethhdr *) skb_mac_header(skb); + + list_for_each_entry_rcu(node, node_db, mac_list) { + if (ether_addr_equal(node->MacAddressA, ethhdr->h_source)) + return node; + if (ether_addr_equal(node->MacAddressB, ethhdr->h_source)) + return node; + } + + if (!is_sup) + return NULL; /* Only supervision frame may create node entry */ + + if (ethhdr->h_proto == htons(ETH_P_PRP)) { + /* Use the existing sequence_nr from the tag as starting point + * for filtering duplicate frames. + */ + seq_out = hsr_get_skb_sequence_nr(skb) - 1; + } else { + WARN_ONCE(1, "%s: Non-HSR frame\n", __func__); + seq_out = 0; + } + + return hsr_add_node(node_db, ethhdr->h_source, seq_out); +} + +/* Use the Supervision frame's info about an eventual MacAddressB for merging + * nodes that has previously had their MacAddressB registered as a separate + * node. 
+ */ +void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, + struct hsr_port *port_rcv) +{ + struct hsr_node *node_real; + struct hsr_sup_payload *hsr_sp; + struct list_head *node_db; + int i; + + skb_pull(skb, sizeof(struct hsr_ethhdr_sp)); + hsr_sp = (struct hsr_sup_payload *) skb->data; + + if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA)) + /* Not sent from MacAddressB of a PICS_SUBS capable node */ + goto done; + + /* Merge node_curr (registered on MacAddressB) into node_real */ + node_db = &port_rcv->hsr->node_db; + node_real = find_node_by_AddrA(node_db, hsr_sp->MacAddressA); + if (!node_real) + /* No frame received from AddrA of this node yet */ + node_real = hsr_add_node(node_db, hsr_sp->MacAddressA, + HSR_SEQNR_START - 1); + if (!node_real) + goto done; /* No mem */ + if (node_real == node_curr) + /* Node has already been merged */ + goto done; + + ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source); + for (i = 0; i < HSR_PT_PORTS; i++) { + if (!node_curr->time_in_stale[i] && + time_after(node_curr->time_in[i], node_real->time_in[i])) { + node_real->time_in[i] = node_curr->time_in[i]; + node_real->time_in_stale[i] = node_curr->time_in_stale[i]; + } + if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) + node_real->seq_out[i] = node_curr->seq_out[i]; + } + node_real->AddrB_port = port_rcv->type; + + list_del_rcu(&node_curr->mac_list); + kfree_rcu(node_curr, rcu_head); + +done: + skb_push(skb, sizeof(struct hsr_ethhdr_sp)); +} + /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * - * If the frame was sent by a node's B interface, replace the sender + * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (MacAddressA) so that upper * layers recognize where it came from. 
*/ -void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb) +void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { - struct ethhdr *ethhdr; - struct node_entry *node; - if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); return; } - ethhdr = (struct ethhdr *) skb_mac_header(skb); - rcu_read_lock(); - node = find_node_by_AddrB(&hsr_priv->node_db, ethhdr->h_source); - if (node) - ether_addr_copy(ethhdr->h_source, node->MacAddressA); - rcu_read_unlock(); + memcpy(&eth_hdr(skb)->h_source, node->MacAddressA, ETH_ALEN); } - /* 'skb' is a frame meant for another host. - * 'hsr_dev_idx' is the HSR index of the outgoing device + * 'port' is the outgoing interface * * Substitute the target (dest) MAC address if necessary, so the it matches the * recipient interface MAC address, regardless of whether that is the @@ -264,47 +270,44 @@ void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb) * This is needed to keep the packets flowing through switches that learn on * which "side" the different interfaces are. */ -void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr, - enum hsr_dev_idx dev_idx) +void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, + struct hsr_port *port) { - struct node_entry *node; + struct hsr_node *node_dst; - rcu_read_lock(); - node = find_node_by_AddrA(&hsr_priv->node_db, ethhdr->h_dest); - if (node && (node->AddrB_if == dev_idx)) - ether_addr_copy(ethhdr->h_dest, node->MacAddressB); - rcu_read_unlock(); -} + if (!skb_mac_header_was_set(skb)) { + WARN_ONCE(1, "%s: Mac header not set\n", __func__); + return; + } + if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) + return; -/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, - * false otherwise. 
- */ -static bool seq_nr_after(u16 a, u16 b) -{ - /* Remove inconsistency where - * seq_nr_after(a, b) == seq_nr_before(a, b) - */ - if ((int) b - a == 32768) - return false; + node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest); + if (!node_dst) { + WARN_ONCE(1, "%s: Unknown node\n", __func__); + return; + } + if (port->type != node_dst->AddrB_port) + return; - return (((s16) (b - a)) < 0); + ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->MacAddressB); } -#define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) -#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx) +void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, + u16 sequence_nr) { - if ((dev_idx < 0) || (dev_idx >= HSR_MAX_SLAVE)) { - WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx); + /* Don't register incoming frames without a valid sequence number. This + * ensures entries of restarted nodes gets pruned so that they can + * re-register and resume communications. + */ + if (seq_nr_before(sequence_nr, node->seq_out[port->type])) return; - } - node->time_in[dev_idx] = jiffies; - node->time_in_stale[dev_idx] = false; -} + node->time_in[port->type] = jiffies; + node->time_in_stale[port->type] = false; +} /* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid * ethhdr->h_source address and skb->mac_header set. 
@@ -314,102 +317,87 @@ void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx dev_idx) * 0 otherwise, or * negative error code on error */ -int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx, - struct sk_buff *skb) +int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, + u16 sequence_nr) { - struct hsr_ethhdr *hsr_ethhdr; - u16 sequence_nr; - - if ((dev_idx < 0) || (dev_idx >= HSR_MAX_DEV)) { - WARN_ONCE(1, "%s: Invalid dev_idx (%d)\n", __func__, dev_idx); - return -EINVAL; - } - if (!skb_mac_header_was_set(skb)) { - WARN_ONCE(1, "%s: Mac header not set\n", __func__); - return -EINVAL; - } - hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); - - sequence_nr = ntohs(hsr_ethhdr->hsr_tag.sequence_nr); - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[dev_idx])) + if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type])) return 1; - node->seq_out[dev_idx] = sequence_nr; + node->seq_out[port->type] = sequence_nr; return 0; } - -static bool is_late(struct node_entry *node, enum hsr_dev_idx dev_idx) +static struct hsr_port *get_late_port(struct hsr_priv *hsr, + struct hsr_node *node) { - enum hsr_dev_idx other; - - if (node->time_in_stale[dev_idx]) - return true; - - if (dev_idx == HSR_DEV_SLAVE_A) - other = HSR_DEV_SLAVE_B; - else - other = HSR_DEV_SLAVE_A; - - if (node->time_in_stale[other]) - return false; + if (node->time_in_stale[HSR_PT_SLAVE_A]) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (node->time_in_stale[HSR_PT_SLAVE_B]) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + + if (time_after(node->time_in[HSR_PT_SLAVE_B], + node->time_in[HSR_PT_SLAVE_A] + + msecs_to_jiffies(MAX_SLAVE_DIFF))) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (time_after(node->time_in[HSR_PT_SLAVE_A], + node->time_in[HSR_PT_SLAVE_B] + + msecs_to_jiffies(MAX_SLAVE_DIFF))) + return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); - if (time_after(node->time_in[other], node->time_in[dev_idx] + - 
msecs_to_jiffies(MAX_SLAVE_DIFF))) - return true; - - return false; + return NULL; } /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). */ -void hsr_prune_nodes(struct hsr_priv *hsr_priv) +void hsr_prune_nodes(unsigned long data) { - struct node_entry *node; + struct hsr_priv *hsr; + struct hsr_node *node; + struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; + hsr = (struct hsr_priv *) data; + rcu_read_lock(); - list_for_each_entry_rcu(node, &hsr_priv->node_db, mac_list) { + list_for_each_entry_rcu(node, &hsr->node_db, mac_list) { /* Shorthand */ - time_a = node->time_in[HSR_DEV_SLAVE_A]; - time_b = node->time_in[HSR_DEV_SLAVE_B]; + time_a = node->time_in[HSR_PT_SLAVE_A]; + time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2)) - node->time_in_stale[HSR_DEV_SLAVE_A] = true; + node->time_in_stale[HSR_PT_SLAVE_A] = true; if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2)) - node->time_in_stale[HSR_DEV_SLAVE_B] = true; + node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. 
* At least one time_in is OK here; nodes get pruned long * before both time_ins can get stale */ timestamp = time_a; - if (node->time_in_stale[HSR_DEV_SLAVE_A] || - (!node->time_in_stale[HSR_DEV_SLAVE_B] && + if (node->time_in_stale[HSR_PT_SLAVE_A] || + (!node->time_in_stale[HSR_PT_SLAVE_B] && time_after(time_b, time_a))) timestamp = time_b; /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) { - - if (is_late(node, HSR_DEV_SLAVE_A)) - hsr_nl_ringerror(hsr_priv, node->MacAddressA, - HSR_DEV_SLAVE_A); - else if (is_late(node, HSR_DEV_SLAVE_B)) - hsr_nl_ringerror(hsr_priv, node->MacAddressA, - HSR_DEV_SLAVE_B); + rcu_read_lock(); + port = get_late_port(hsr, node); + if (port != NULL) + hsr_nl_ringerror(hsr, node->MacAddressA, port); + rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { - hsr_nl_nodedown(hsr_priv, node->MacAddressA); + hsr_nl_nodedown(hsr, node->MacAddressA); list_del_rcu(&node->mac_list); /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); @@ -419,21 +407,21 @@ void hsr_prune_nodes(struct hsr_priv *hsr_priv) } -void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, +void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { - struct node_entry *node; + struct hsr_node *node; if (!_pos) { - node = list_first_or_null_rcu(&hsr_priv->node_db, - struct node_entry, mac_list); + node = list_first_or_null_rcu(&hsr->node_db, + struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->MacAddressA); return node; } node = _pos; - list_for_each_entry_continue_rcu(node, &hsr_priv->node_db, mac_list) { + list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->MacAddressA); return node; } @@ -442,7 +430,7 @@ void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, } -int 
hsr_get_node_data(struct hsr_priv *hsr_priv, +int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, @@ -451,12 +439,13 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, int *if2_age, u16 *if2_seq) { - struct node_entry *node; + struct hsr_node *node; + struct hsr_port *port; unsigned long tdiff; rcu_read_lock(); - node = find_node_by_AddrA(&hsr_priv->node_db, addr); + node = find_node_by_AddrA(&hsr->node_db, addr); if (!node) { rcu_read_unlock(); return -ENOENT; /* No such entry */ @@ -464,8 +453,8 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, ether_addr_copy(addr_b, node->MacAddressB); - tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_A]; - if (node->time_in_stale[HSR_DEV_SLAVE_A]) + tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; + if (node->time_in_stale[HSR_PT_SLAVE_A]) *if1_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) @@ -474,8 +463,8 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, else *if1_age = jiffies_to_msecs(tdiff); - tdiff = jiffies - node->time_in[HSR_DEV_SLAVE_B]; - if (node->time_in_stale[HSR_DEV_SLAVE_B]) + tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B]; + if (node->time_in_stale[HSR_PT_SLAVE_B]) *if2_age = INT_MAX; #if HZ <= MSEC_PER_SEC else if (tdiff > msecs_to_jiffies(INT_MAX)) @@ -485,13 +474,15 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ - *if1_seq = node->seq_out[HSR_DEV_SLAVE_B]; - *if2_seq = node->seq_out[HSR_DEV_SLAVE_A]; + *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; + *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; - if ((node->AddrB_if != HSR_DEV_NONE) && hsr_priv->slave[node->AddrB_if]) - *addr_b_ifindex = hsr_priv->slave[node->AddrB_if]->ifindex; - else + if (node->AddrB_port != HSR_PT_NONE) { + port = hsr_port_get_hsr(hsr, node->AddrB_port); + *addr_b_ifindex = port->dev->ifindex; + } else { 
*addr_b_ifindex = -1; + } rcu_read_unlock(); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index e6c4022030ad..438b40f98f5a 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,42 +6,43 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ -#ifndef _HSR_FRAMEREG_H -#define _HSR_FRAMEREG_H +#ifndef __HSR_FRAMEREG_H +#define __HSR_FRAMEREG_H #include "hsr_main.h" -struct node_entry; +struct hsr_node; -struct node_entry *hsr_find_node(struct list_head *node_db, struct sk_buff *skb); +struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], + u16 seq_out); +struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, + bool is_sup); +void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, + struct hsr_port *port); +bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr); -struct node_entry *hsr_merge_node(struct hsr_priv *hsr_priv, - struct node_entry *node, - struct sk_buff *skb, - enum hsr_dev_idx dev_idx); +void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb); +void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, + struct hsr_port *port); -void hsr_addr_subst_source(struct hsr_priv *hsr_priv, struct sk_buff *skb); -void hsr_addr_subst_dest(struct hsr_priv *hsr_priv, struct ethhdr *ethhdr, - enum hsr_dev_idx dev_idx); +void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, + u16 sequence_nr); +int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, + u16 sequence_nr); -void hsr_register_frame_in(struct node_entry *node, enum hsr_dev_idx 
dev_idx); - -int hsr_register_frame_out(struct node_entry *node, enum hsr_dev_idx dev_idx, - struct sk_buff *skb); - -void hsr_prune_nodes(struct hsr_priv *hsr_priv); +void hsr_prune_nodes(unsigned long data); int hsr_create_self_node(struct list_head *self_node_db, unsigned char addr_a[ETH_ALEN], unsigned char addr_b[ETH_ALEN]); -void *hsr_get_next_node(struct hsr_priv *hsr_priv, void *_pos, +void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]); -int hsr_get_node_data(struct hsr_priv *hsr_priv, +int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], unsigned int *addr_b_ifindex, @@ -50,4 +51,4 @@ int hsr_get_node_data(struct hsr_priv *hsr_priv, int *if2_age, u16 *if2_seq); -#endif /* _HSR_FRAMEREG_H */ +#endif /* __HSR_FRAMEREG_H */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 3fee5218a691..779d28b65417 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,11 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com - * - * In addition to routines for registering and unregistering HSR support, this - * file also contains the receive routine that handles all incoming frames with - * Ethertype (protocol) ETH_P_PRP (HSRv0), and network device event handling. 
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #include <linux/netdevice.h> @@ -21,154 +17,71 @@ #include "hsr_device.h" #include "hsr_netlink.h" #include "hsr_framereg.h" - - -/* List of all registered virtual HSR devices */ -static LIST_HEAD(hsr_list); - -void register_hsr_master(struct hsr_priv *hsr_priv) -{ - list_add_tail_rcu(&hsr_priv->hsr_list, &hsr_list); -} - -void unregister_hsr_master(struct hsr_priv *hsr_priv) -{ - struct hsr_priv *hsr_priv_it; - - list_for_each_entry(hsr_priv_it, &hsr_list, hsr_list) - if (hsr_priv_it == hsr_priv) { - list_del_rcu(&hsr_priv_it->hsr_list); - return; - } -} - -bool is_hsr_slave(struct net_device *dev) -{ - struct hsr_priv *hsr_priv_it; - - list_for_each_entry_rcu(hsr_priv_it, &hsr_list, hsr_list) { - if (dev == hsr_priv_it->slave[0]) - return true; - if (dev == hsr_priv_it->slave[1]) - return true; - } - - return false; -} - - -/* If dev is a HSR slave device, return the virtual master device. Return NULL - * otherwise. - */ -static struct hsr_priv *get_hsr_master(struct net_device *dev) -{ - struct hsr_priv *hsr_priv; - - rcu_read_lock(); - list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list) - if ((dev == hsr_priv->slave[0]) || - (dev == hsr_priv->slave[1])) { - rcu_read_unlock(); - return hsr_priv; - } - - rcu_read_unlock(); - return NULL; -} - - -/* If dev is a HSR slave device, return the other slave device. Return NULL - * otherwise. 
- */ -static struct net_device *get_other_slave(struct hsr_priv *hsr_priv, - struct net_device *dev) -{ - if (dev == hsr_priv->slave[0]) - return hsr_priv->slave[1]; - if (dev == hsr_priv->slave[1]) - return hsr_priv->slave[0]; - - return NULL; -} +#include "hsr_slave.h" static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *slave, *other_slave; - struct hsr_priv *hsr_priv; - int old_operstate; + struct net_device *dev; + struct hsr_port *port, *master; + struct hsr_priv *hsr; int mtu_max; int res; - struct net_device *dev; dev = netdev_notifier_info_to_dev(ptr); - - hsr_priv = get_hsr_master(dev); - if (hsr_priv) { - /* dev is a slave device */ - slave = dev; - other_slave = get_other_slave(hsr_priv, slave); - } else { + port = hsr_port_get_rtnl(dev); + if (port == NULL) { if (!is_hsr_master(dev)) - return NOTIFY_DONE; - hsr_priv = netdev_priv(dev); - slave = hsr_priv->slave[0]; - other_slave = hsr_priv->slave[1]; + return NOTIFY_DONE; /* Not an HSR device */ + hsr = netdev_priv(dev); + port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + } else { + hsr = port->hsr; } switch (event) { case NETDEV_UP: /* Administrative state DOWN */ case NETDEV_DOWN: /* Administrative state UP */ case NETDEV_CHANGE: /* Link (carrier) state changes */ - old_operstate = hsr_priv->dev->operstate; - hsr_set_carrier(hsr_priv->dev, slave, other_slave); - /* netif_stacked_transfer_operstate() cannot be used here since - * it doesn't set IF_OPER_LOWERLAYERDOWN (?) - */ - hsr_set_operstate(hsr_priv->dev, slave, other_slave); - hsr_check_announce(hsr_priv->dev, old_operstate); + hsr_check_carrier_and_operstate(hsr); break; case NETDEV_CHANGEADDR: - - /* This should not happen since there's no ndo_set_mac_address() - * for HSR devices - i.e. not supported. - */ - if (dev == hsr_priv->dev) + if (port->type == HSR_PT_MASTER) { + /* This should not happen since there's no + * ndo_set_mac_address() for HSR devices - i.e. not + * supported. 
+ */ break; + } - if (dev == hsr_priv->slave[0]) - ether_addr_copy(hsr_priv->dev->dev_addr, - hsr_priv->slave[0]->dev_addr); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + + if (port->type == HSR_PT_SLAVE_A) { + ether_addr_copy(master->dev->dev_addr, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + } /* Make sure we recognize frames from ourselves in hsr_rcv() */ - res = hsr_create_self_node(&hsr_priv->self_node_db, - hsr_priv->dev->dev_addr, - hsr_priv->slave[1] ? - hsr_priv->slave[1]->dev_addr : - hsr_priv->dev->dev_addr); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + res = hsr_create_self_node(&hsr->self_node_db, + master->dev->dev_addr, + port ? + port->dev->dev_addr : + master->dev->dev_addr); if (res) - netdev_warn(hsr_priv->dev, + netdev_warn(master->dev, "Could not update HSR node address.\n"); - - if (dev == hsr_priv->slave[0]) - call_netdevice_notifiers(NETDEV_CHANGEADDR, hsr_priv->dev); break; case NETDEV_CHANGEMTU: - if (dev == hsr_priv->dev) + if (port->type == HSR_PT_MASTER) break; /* Handled in ndo_change_mtu() */ - mtu_max = hsr_get_max_mtu(hsr_priv); - if (hsr_priv->dev->mtu > mtu_max) - dev_set_mtu(hsr_priv->dev, mtu_max); + mtu_max = hsr_get_max_mtu(port->hsr); + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); + master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (dev == hsr_priv->slave[0]) - hsr_priv->slave[0] = NULL; - if (dev == hsr_priv->slave[1]) - hsr_priv->slave[1] = NULL; - - /* There should really be a way to set a new slave device... */ - + hsr_del_port(port); break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. 
Refuse slave to change @@ -181,255 +94,16 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, } -static struct timer_list prune_timer; - -static void prune_nodes_all(unsigned long data) -{ - struct hsr_priv *hsr_priv; - - rcu_read_lock(); - list_for_each_entry_rcu(hsr_priv, &hsr_list, hsr_list) - hsr_prune_nodes(hsr_priv); - rcu_read_unlock(); - - prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); - add_timer(&prune_timer); -} - - -static struct sk_buff *hsr_pull_tag(struct sk_buff *skb) +struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { - struct hsr_tag *hsr_tag; - struct sk_buff *skb2; - - skb2 = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb2)) - goto err_free; - skb = skb2; - - if (unlikely(!pskb_may_pull(skb, HSR_TAGLEN))) - goto err_free; + struct hsr_port *port; - hsr_tag = (struct hsr_tag *) skb->data; - skb->protocol = hsr_tag->encap_proto; - skb_pull(skb, HSR_TAGLEN); - - return skb; - -err_free: - kfree_skb(skb); + hsr_for_each_port(hsr, port) + if (port->type == pt) + return port; return NULL; } - -/* The uses I can see for these HSR supervision frames are: - * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = - * 22") to reset any sequence_nr counters belonging to that node. Useful if - * the other node's counter has been reset for some reason. - * -- - * Or not - resetting the counter and bridging the frame would create a - * loop, unfortunately. - * - * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck - * frame is received from a particular node, we know something is wrong. - * We just register these (as with normal frames) and throw them away. - * - * 3) Allow different MAC addresses for the two slave interfaces, using the - * MacAddressA field. 
- */ -static bool is_supervision_frame(struct hsr_priv *hsr_priv, struct sk_buff *skb) -{ - struct hsr_sup_tag *hsr_stag; - - if (!ether_addr_equal(eth_hdr(skb)->h_dest, - hsr_priv->sup_multicast_addr)) - return false; - - hsr_stag = (struct hsr_sup_tag *) skb->data; - if (get_hsr_stag_path(hsr_stag) != 0x0f) - return false; - if ((hsr_stag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) && - (hsr_stag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) - return false; - if (hsr_stag->HSR_TLV_Length != 12) - return false; - - return true; -} - - -/* Implementation somewhat according to IEC-62439-3, p. 43 - */ -static int hsr_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) -{ - struct hsr_priv *hsr_priv; - struct net_device *other_slave; - struct node_entry *node; - bool deliver_to_self; - struct sk_buff *skb_deliver; - enum hsr_dev_idx dev_in_idx, dev_other_idx; - bool dup_out; - int ret; - - hsr_priv = get_hsr_master(dev); - - if (!hsr_priv) { - /* Non-HSR-slave device 'dev' is connected to a HSR network */ - kfree_skb(skb); - dev->stats.rx_errors++; - return NET_RX_SUCCESS; - } - - if (dev == hsr_priv->slave[0]) { - dev_in_idx = HSR_DEV_SLAVE_A; - dev_other_idx = HSR_DEV_SLAVE_B; - } else { - dev_in_idx = HSR_DEV_SLAVE_B; - dev_other_idx = HSR_DEV_SLAVE_A; - } - - node = hsr_find_node(&hsr_priv->self_node_db, skb); - if (node) { - /* Always kill frames sent by ourselves */ - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* Is this frame a candidate for local reception? 
*/ - deliver_to_self = false; - if ((skb->pkt_type == PACKET_HOST) || - (skb->pkt_type == PACKET_MULTICAST) || - (skb->pkt_type == PACKET_BROADCAST)) - deliver_to_self = true; - else if (ether_addr_equal(eth_hdr(skb)->h_dest, - hsr_priv->dev->dev_addr)) { - skb->pkt_type = PACKET_HOST; - deliver_to_self = true; - } - - - rcu_read_lock(); /* node_db */ - node = hsr_find_node(&hsr_priv->node_db, skb); - - if (is_supervision_frame(hsr_priv, skb)) { - skb_pull(skb, sizeof(struct hsr_sup_tag)); - node = hsr_merge_node(hsr_priv, node, skb, dev_in_idx); - if (!node) { - rcu_read_unlock(); /* node_db */ - kfree_skb(skb); - hsr_priv->dev->stats.rx_dropped++; - return NET_RX_DROP; - } - skb_push(skb, sizeof(struct hsr_sup_tag)); - deliver_to_self = false; - } - - if (!node) { - /* Source node unknown; this might be a HSR frame from - * another net (different multicast address). Ignore it. - */ - rcu_read_unlock(); /* node_db */ - kfree_skb(skb); - return NET_RX_SUCCESS; - } - - /* Register ALL incoming frames as outgoing through the other interface. - * This allows us to register frames as incoming only if they are valid - * for the receiving interface, without using a specific counter for - * incoming frames. - */ - dup_out = hsr_register_frame_out(node, dev_other_idx, skb); - if (!dup_out) - hsr_register_frame_in(node, dev_in_idx); - - /* Forward this frame? */ - if (!dup_out && (skb->pkt_type != PACKET_HOST)) - other_slave = get_other_slave(hsr_priv, dev); - else - other_slave = NULL; - - if (hsr_register_frame_out(node, HSR_DEV_MASTER, skb)) - deliver_to_self = false; - - rcu_read_unlock(); /* node_db */ - - if (!deliver_to_self && !other_slave) { - kfree_skb(skb); - /* Circulated frame; silently remove it. 
*/ - return NET_RX_SUCCESS; - } - - skb_deliver = skb; - if (deliver_to_self && other_slave) { - /* skb_clone() is not enough since we will strip the hsr tag - * and do address substitution below - */ - skb_deliver = pskb_copy(skb, GFP_ATOMIC); - if (!skb_deliver) { - deliver_to_self = false; - hsr_priv->dev->stats.rx_dropped++; - } - } - - if (deliver_to_self) { - bool multicast_frame; - - skb_deliver = hsr_pull_tag(skb_deliver); - if (!skb_deliver) { - hsr_priv->dev->stats.rx_dropped++; - goto forward; - } -#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - /* Move everything in the header that is after the HSR tag, - * to work around alignment problems caused by the 6-byte HSR - * tag. In practice, this removes/overwrites the HSR tag in - * the header and restores a "standard" packet. - */ - memmove(skb_deliver->data - HSR_TAGLEN, skb_deliver->data, - skb_headlen(skb_deliver)); - - /* Adjust skb members so they correspond with the move above. - * This cannot possibly underflow skb->data since hsr_pull_tag() - * above succeeded. - * At this point in the protocol stack, the transport and - * network headers have not been set yet, and we haven't touched - * the mac header nor the head. 
So we only need to adjust data - * and tail: - */ - skb_deliver->data -= HSR_TAGLEN; - skb_deliver->tail -= HSR_TAGLEN; -#endif - skb_deliver->dev = hsr_priv->dev; - hsr_addr_subst_source(hsr_priv, skb_deliver); - multicast_frame = (skb_deliver->pkt_type == PACKET_MULTICAST); - ret = netif_rx(skb_deliver); - if (ret == NET_RX_DROP) { - hsr_priv->dev->stats.rx_dropped++; - } else { - hsr_priv->dev->stats.rx_packets++; - hsr_priv->dev->stats.rx_bytes += skb->len; - if (multicast_frame) - hsr_priv->dev->stats.multicast++; - } - } - -forward: - if (other_slave) { - skb_push(skb, ETH_HLEN); - skb->dev = other_slave; - dev_queue_xmit(skb); - } - - return NET_RX_SUCCESS; -} - - -static struct packet_type hsr_pt __read_mostly = { - .type = htons(ETH_P_PRP), - .func = hsr_rcv, -}; - static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; @@ -439,18 +113,9 @@ static int __init hsr_init(void) { int res; - BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_TAGLEN); - - dev_add_pack(&hsr_pt); - - init_timer(&prune_timer); - prune_timer.function = prune_nodes_all; - prune_timer.data = 0; - prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); - add_timer(&prune_timer); + BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN); register_netdevice_notifier(&hsr_nb); - res = hsr_netlink_init(); return res; @@ -459,9 +124,7 @@ static int __init hsr_init(void) static void __exit hsr_exit(void) { unregister_netdevice_notifier(&hsr_nb); - del_timer_sync(&prune_timer); hsr_netlink_exit(); - dev_remove_pack(&hsr_pt); } module_init(hsr_init); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 56fe060c0ab1..5a9c69962ded 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as 
published by the Free @@ -6,11 +6,11 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ -#ifndef _HSR_PRIVATE_H -#define _HSR_PRIVATE_H +#ifndef __HSR_PRIVATE_H +#define __HSR_PRIVATE_H #include <linux/netdevice.h> #include <linux/list.h> @@ -29,6 +29,7 @@ * each node differ before we notify of communication problem? */ #define MAX_SLAVE_DIFF 3000 /* ms */ +#define HSR_SEQNR_START (USHRT_MAX - 1024) /* How often shall we check for broken ring and remove node entries older than @@ -46,16 +47,16 @@ * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, * encapsulated protocol } instead. + * + * Field names as defined in the IEC:2010 standard for HSR. */ -#define HSR_TAGLEN 6 - -/* Field names below as defined in the IEC:2010 standard for HSR. */ struct hsr_tag { __be16 path_and_LSDU_size; __be16 sequence_nr; __be16 encap_proto; } __packed; +#define HSR_HLEN 6 /* The helper functions below assumes that 'path' occupies the 4 most * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or @@ -136,31 +137,47 @@ struct hsr_ethhdr_sp { } __packed; -enum hsr_dev_idx { - HSR_DEV_NONE = -1, - HSR_DEV_SLAVE_A = 0, - HSR_DEV_SLAVE_B, - HSR_DEV_MASTER, +enum hsr_port_type { + HSR_PT_NONE = 0, /* Must be 0, used by framereg */ + HSR_PT_SLAVE_A, + HSR_PT_SLAVE_B, + HSR_PT_INTERLINK, + HSR_PT_MASTER, + HSR_PT_PORTS, /* This must be the last item in the enum */ +}; + +struct hsr_port { + struct list_head port_list; + struct net_device *dev; + struct hsr_priv *hsr; + enum hsr_port_type type; }; -#define HSR_MAX_SLAVE (HSR_DEV_SLAVE_B + 1) -#define HSR_MAX_DEV (HSR_DEV_MASTER + 1) struct hsr_priv { - struct list_head hsr_list; /* List of hsr devices */ struct rcu_head rcu_head; - struct net_device *dev; - struct net_device *slave[HSR_MAX_SLAVE]; - struct list_head node_db; /* Other HSR 
nodes */ + struct list_head ports; + struct list_head node_db; /* Known HSR nodes */ struct list_head self_node_db; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ + struct timer_list prune_timer; int announce_count; u16 sequence_nr; spinlock_t seqnr_lock; /* locking for sequence_nr */ unsigned char sup_multicast_addr[ETH_ALEN]; }; -void register_hsr_master(struct hsr_priv *hsr_priv); -void unregister_hsr_master(struct hsr_priv *hsr_priv); -bool is_hsr_slave(struct net_device *dev); +#define hsr_for_each_port(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list) + +struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); + +/* Caller must ensure skb is a valid HSR frame */ +static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) +{ + struct hsr_ethhdr *hsr_ethhdr; + + hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); +} -#endif /* _HSR_PRIVATE_H */ +#endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 01a5261ac7a5..a2c7e4c0ac1e 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. * * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Routines for handling Netlink messages for HSR. 
*/ @@ -37,13 +37,17 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct net_device *link[2]; unsigned char multicast_spec; + if (!data) { + netdev_info(dev, "HSR: No slave devices specified\n"); + return -EINVAL; + } if (!data[IFLA_HSR_SLAVE1]) { - netdev_info(dev, "IFLA_HSR_SLAVE1 missing!\n"); + netdev_info(dev, "HSR: Slave1 device not specified\n"); return -EINVAL; } link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!data[IFLA_HSR_SLAVE2]) { - netdev_info(dev, "IFLA_HSR_SLAVE2 missing!\n"); + netdev_info(dev, "HSR: Slave2 device not specified\n"); return -EINVAL; } link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); @@ -63,21 +67,33 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; + int res; - hsr_priv = netdev_priv(dev); + hsr = netdev_priv(dev); - if (hsr_priv->slave[0]) - if (nla_put_u32(skb, IFLA_HSR_SLAVE1, hsr_priv->slave[0]->ifindex)) - goto nla_put_failure; + res = 0; - if (hsr_priv->slave[1]) - if (nla_put_u32(skb, IFLA_HSR_SLAVE2, hsr_priv->slave[1]->ifindex)) - goto nla_put_failure; + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (port) + res = nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex); + rcu_read_unlock(); + if (res) + goto nla_put_failure; + + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) + res = nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex); + rcu_read_unlock(); + if (res) + goto nla_put_failure; if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN, - hsr_priv->sup_multicast_addr) || - nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr_priv->sequence_nr)) + hsr->sup_multicast_addr) || + nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; return 0; @@ -128,13 +144,13 @@ static const struct genl_multicast_group 
hsr_mcgrps[] = { * over one of the slave interfaces. This would indicate an open network ring * (i.e. a link has failed somewhere). */ -void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], - enum hsr_dev_idx dev_idx) +void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], + struct hsr_port *port) { struct sk_buff *skb; void *msg_head; + struct hsr_port *master; int res; - int ifindex; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb) @@ -148,11 +164,7 @@ void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[dev_idx]) - ifindex = hsr_priv->slave[dev_idx]->ifindex; - else - ifindex = -1; - res = nla_put_u32(skb, HSR_A_IFINDEX, ifindex); + res = nla_put_u32(skb, HSR_A_IFINDEX, port->dev->ifindex); if (res < 0) goto nla_put_failure; @@ -165,16 +177,20 @@ nla_put_failure: kfree_skb(skb); fail: - netdev_warn(hsr_priv->dev, "Could not send HSR ring error message\n"); + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_warn(master->dev, "Could not send HSR ring error message\n"); + rcu_read_unlock(); } /* This is called when we haven't heard from the node with MAC address addr for * some time (just before the node is removed from the node table/list). 
*/ -void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]) +void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]) { struct sk_buff *skb; void *msg_head; + struct hsr_port *master; int res; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -199,7 +215,10 @@ nla_put_failure: kfree_skb(skb); fail: - netdev_warn(hsr_priv->dev, "Could not send HSR node down\n"); + rcu_read_lock(); + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_warn(master->dev, "Could not send HSR node down\n"); + rcu_read_unlock(); } @@ -220,7 +239,8 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) /* For sending */ struct sk_buff *skb_out; void *msg_head; - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; + struct hsr_port *port; unsigned char hsr_node_addr_b[ETH_ALEN]; int hsr_node_if1_age; u16 hsr_node_if1_seq; @@ -267,8 +287,8 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (res < 0) goto nla_put_failure; - hsr_priv = netdev_priv(hsr_dev); - res = hsr_get_node_data(hsr_priv, + hsr = netdev_priv(hsr_dev); + res = hsr_get_node_data(hsr, (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]), hsr_node_addr_b, &addr_b_ifindex, @@ -301,9 +321,12 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[0]) + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); + if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, - hsr_priv->slave[0]->ifindex); + port->dev->ifindex); + rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -313,9 +336,14 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - if (hsr_priv->slave[1]) + rcu_read_lock(); + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) res = 
nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, - hsr_priv->slave[1]->ifindex); + port->dev->ifindex); + rcu_read_unlock(); + if (res < 0) + goto nla_put_failure; genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); @@ -334,7 +362,7 @@ fail: return res; } -/* Get a list of MacAddressA of all nodes known to this node (other than self). +/* Get a list of MacAddressA of all nodes known to this node (including self). */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { @@ -345,7 +373,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) /* For sending */ struct sk_buff *skb_out; void *msg_head; - struct hsr_priv *hsr_priv; + struct hsr_priv *hsr; void *pos; unsigned char addr[ETH_ALEN]; int res; @@ -385,17 +413,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (res < 0) goto nla_put_failure; - hsr_priv = netdev_priv(hsr_dev); + hsr = netdev_priv(hsr_dev); rcu_read_lock(); - pos = hsr_get_next_node(hsr_priv, NULL, addr); + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); if (res < 0) { rcu_read_unlock(); goto nla_put_failure; } - pos = hsr_get_next_node(hsr_priv, pos, addr); + pos = hsr_get_next_node(hsr, pos, addr); } rcu_read_unlock(); diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index d4579dcc3c7d..3f6b95b5b6b8 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -1,4 +1,4 @@ -/* Copyright 2011-2013 Autronica Fire and Security AS +/* Copyright 2011-2014 Autronica Fire and Security AS * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -6,7 +6,7 @@ * any later version. 
* * Author(s): - * 2011-2013 Arvid Brodin, arvid.brodin@xdin.com + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ #ifndef __HSR_NETLINK_H @@ -17,13 +17,14 @@ #include <uapi/linux/hsr_netlink.h> struct hsr_priv; +struct hsr_port; int __init hsr_netlink_init(void); void __exit hsr_netlink_exit(void); -void hsr_nl_ringerror(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN], - int dev_idx); -void hsr_nl_nodedown(struct hsr_priv *hsr_priv, unsigned char addr[ETH_ALEN]); +void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], + struct hsr_port *port); +void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]); void hsr_nl_framedrop(int dropcount, int dev_idx); void hsr_nl_linkdown(int dev_idx); diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c new file mode 100644 index 000000000000..a348dcbcd683 --- /dev/null +++ b/net/hsr/hsr_slave.c @@ -0,0 +1,196 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#include "hsr_slave.h" +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include "hsr_main.h" +#include "hsr_device.h" +#include "hsr_forward.h" +#include "hsr_framereg.h" + + +static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct hsr_port *port; + + if (!skb_mac_header_was_set(skb)) { + WARN_ONCE(1, "%s: skb invalid", __func__); + return RX_HANDLER_PASS; + } + + rcu_read_lock(); /* hsr->node_db, hsr->ports */ + port = hsr_port_get_rcu(skb->dev); + + if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { + /* Directly kill frames sent by ourselves */ + kfree_skb(skb); + goto finish_consume; + } + + if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP)) + goto finish_pass; + + skb_push(skb, ETH_HLEN); + + hsr_forward_skb(skb, port); + +finish_consume: + rcu_read_unlock(); /* hsr->node_db, hsr->ports */ + return RX_HANDLER_CONSUMED; + +finish_pass: + rcu_read_unlock(); /* hsr->node_db, hsr->ports */ + return RX_HANDLER_PASS; +} + +bool hsr_port_exists(const struct net_device *dev) +{ + return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; +} + + +static int hsr_check_dev_ok(struct net_device *dev) +{ + /* Don't allow HSR on non-ethernet like devices */ + if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) || + (dev->addr_len != ETH_ALEN)) { + netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n"); + return -EINVAL; + } + + /* Don't allow enslaving hsr devices */ + if (is_hsr_master(dev)) { + netdev_info(dev, "Cannot create trees of HSR devices.\n"); + return -EINVAL; + } + + if (hsr_port_exists(dev)) { + netdev_info(dev, "This device is already a HSR slave.\n"); + return -EINVAL; + } + + if (dev->priv_flags & IFF_802_1Q_VLAN) { + netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n"); + return -EINVAL; + } + + if (dev->priv_flags & IFF_DONT_BRIDGE) { + 
netdev_info(dev, "This device does not support bridging.\n"); + return -EOPNOTSUPP; + } + + /* HSR over bonded devices has not been tested, but I'm not sure it + * won't work... + */ + + return 0; +} + + +/* Setup device to be added to the HSR bridge. */ +static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port) +{ + int res; + + dev_hold(dev); + res = dev_set_promiscuity(dev, 1); + if (res) + goto fail_promiscuity; + + /* FIXME: + * What does net device "adjacency" mean? Should we do + * res = netdev_master_upper_dev_link(port->dev, port->hsr->dev); ? + */ + + res = netdev_rx_handler_register(dev, hsr_handle_frame, port); + if (res) + goto fail_rx_handler; + dev_disable_lro(dev); + + return 0; + +fail_rx_handler: + dev_set_promiscuity(dev, -1); +fail_promiscuity: + dev_put(dev); + + return res; +} + +int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, + enum hsr_port_type type) +{ + struct hsr_port *port, *master; + int res; + + if (type != HSR_PT_MASTER) { + res = hsr_check_dev_ok(dev); + if (res) + return res; + } + + port = hsr_port_get_hsr(hsr, type); + if (port != NULL) + return -EBUSY; /* This port already exists */ + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (port == NULL) + return -ENOMEM; + + if (type != HSR_PT_MASTER) { + res = hsr_portdev_setup(dev, port); + if (res) + goto fail_dev_setup; + } + + port->hsr = hsr; + port->dev = dev; + port->type = type; + + list_add_tail_rcu(&port->port_list, &hsr->ports); + synchronize_rcu(); + + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + netdev_update_features(master->dev); + dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); + + return 0; + +fail_dev_setup: + kfree(port); + return res; +} + +void hsr_del_port(struct hsr_port *port) +{ + struct hsr_priv *hsr; + struct hsr_port *master; + + hsr = port->hsr; + master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); + list_del_rcu(&port->port_list); + + if (port != master) { + netdev_update_features(master->dev); + dev_set_mtu(master->dev, 
hsr_get_max_mtu(hsr)); + netdev_rx_handler_unregister(port->dev); + dev_set_promiscuity(port->dev, -1); + } + + /* FIXME? + * netdev_upper_dev_unlink(port->dev, port->hsr->dev); + */ + + synchronize_rcu(); + dev_put(port->dev); +} diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h new file mode 100644 index 000000000000..3ccfbf71c92e --- /dev/null +++ b/net/hsr/hsr_slave.h @@ -0,0 +1,38 @@ +/* Copyright 2011-2014 Autronica Fire and Security AS + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Author(s): + * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + */ + +#ifndef __HSR_SLAVE_H +#define __HSR_SLAVE_H + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include "hsr_main.h" + +int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, + enum hsr_port_type pt); +void hsr_del_port(struct hsr_port *port); +bool hsr_port_exists(const struct net_device *dev); + +static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) +{ + ASSERT_RTNL(); + return hsr_port_exists(dev) ? + rtnl_dereference(dev->rx_handler_data) : NULL; +} + +static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) +{ + return hsr_port_exists(dev) ? 
+ rcu_dereference(dev->rx_handler_data) : NULL; +} + +#endif /* __HSR_SLAVE_H */ diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c index fe6bd7a71081..44136297b673 100644 --- a/net/ieee802154/6lowpan_rtnl.c +++ b/net/ieee802154/6lowpan_rtnl.c @@ -71,28 +71,42 @@ struct lowpan_dev_record { struct list_head list; }; +/* don't save pan id, it's intra pan */ +struct lowpan_addr { + u8 mode; + union { + /* IPv6 needs big endian here */ + __be64 extended_addr; + __be16 short_addr; + } u; +}; + +struct lowpan_addr_info { + struct lowpan_addr daddr; + struct lowpan_addr saddr; +}; + static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) { return netdev_priv(dev); } -static inline void lowpan_address_flip(u8 *src, u8 *dest) +static inline struct +lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) { - int i; - for (i = 0; i < IEEE802154_ADDR_LEN; i++) - (dest)[IEEE802154_ADDR_LEN - i - 1] = (src)[i]; + WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct lowpan_addr_info)); + return (struct lowpan_addr_info *)(skb->data - + sizeof(struct lowpan_addr_info)); } -static int lowpan_header_create(struct sk_buff *skb, - struct net_device *dev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) +static int lowpan_header_create(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *_daddr, + const void *_saddr, unsigned int len) { const u8 *saddr = _saddr; const u8 *daddr = _daddr; - struct ieee802154_addr sa, da; - struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct lowpan_addr_info *info; /* TODO: * if this package isn't ipv6 one, where should it be routed? 
@@ -106,45 +120,21 @@ static int lowpan_header_create(struct sk_buff *skb, raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); - lowpan_header_compress(skb, dev, type, daddr, saddr, len); + info = lowpan_skb_priv(skb); - /* NOTE1: I'm still unsure about the fact that compression and WPAN - * header are created here and not later in the xmit. So wait for - * an opinion of net maintainers. - */ - /* NOTE2: to be absolutely correct, we must derive PANid information - * from MAC subif of the 'dev' and 'real_dev' network devices, but - * this isn't implemented in mainline yet, so currently we assign 0xff - */ - cb->type = IEEE802154_FC_TYPE_DATA; + /* TODO: Currently we only support extended_addr */ + info->daddr.mode = IEEE802154_ADDR_LONG; + memcpy(&info->daddr.u.extended_addr, daddr, + sizeof(info->daddr.u.extended_addr)); + info->saddr.mode = IEEE802154_ADDR_LONG; + memcpy(&info->saddr.u.extended_addr, saddr, + sizeof(info->daddr.u.extended_addr)); - /* prepare wpan address data */ - sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); - sa.extended_addr = ieee802154_devaddr_from_raw(saddr); - - /* intra-PAN communications */ - da.pan_id = sa.pan_id; - - /* if the destination address is the broadcast address, use the - * corresponding short address - */ - if (lowpan_is_addr_broadcast(daddr)) { - da.mode = IEEE802154_ADDR_SHORT; - da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); - } else { - da.mode = IEEE802154_ADDR_LONG; - da.extended_addr = ieee802154_devaddr_from_raw(daddr); - } - - cb->ackreq = !lowpan_is_addr_broadcast(daddr); - - return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, - type, (void *)&da, (void *)&sa, 0); + return 0; } static int lowpan_give_skb_to_devices(struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev) { struct lowpan_dev_record *entry; struct sk_buff *skb_cp; @@ -246,7 +236,7 @@ 
lowpan_alloc_frag(struct sk_buff *skb, int size, return ERR_PTR(-rc); } } else { - frag = ERR_PTR(ENOMEM); + frag = ERR_PTR(-ENOMEM); } return frag; @@ -338,13 +328,68 @@ err: return rc; } +static int lowpan_header(struct sk_buff *skb, struct net_device *dev) +{ + struct ieee802154_addr sa, da; + struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct lowpan_addr_info info; + void *daddr, *saddr; + + memcpy(&info, lowpan_skb_priv(skb), sizeof(info)); + + /* TODO: Currently we only support extended_addr */ + daddr = &info.daddr.u.extended_addr; + saddr = &info.saddr.u.extended_addr; + + lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len); + + cb->type = IEEE802154_FC_TYPE_DATA; + + /* prepare wpan address data */ + sa.mode = IEEE802154_ADDR_LONG; + sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev); + sa.extended_addr = ieee802154_devaddr_from_raw(saddr); + + /* intra-PAN communications */ + da.pan_id = sa.pan_id; + + /* if the destination address is the broadcast address, use the + * corresponding short address + */ + if (lowpan_is_addr_broadcast((const u8 *)daddr)) { + da.mode = IEEE802154_ADDR_SHORT; + da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + cb->ackreq = false; + } else { + da.mode = IEEE802154_ADDR_LONG; + da.extended_addr = ieee802154_devaddr_from_raw(daddr); + cb->ackreq = true; + } + + return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, + ETH_P_IPV6, (void *)&da, (void *)&sa, 0); +} + static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) { struct ieee802154_hdr wpan_hdr; - int max_single; + int max_single, ret; pr_debug("package xmit\n"); + /* We must take a copy of the skb before we modify/replace the ipv6 + * header as the header could be used elsewhere + */ + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + ret = lowpan_header(skb, dev); + if (ret < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + if (ieee802154_hdr_peek(skb, &wpan_hdr) < 0) { 
kfree_skb(skb); return NET_XMIT_DROP; @@ -368,24 +413,28 @@ static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev) static struct wpan_phy *lowpan_get_phy(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_phy(real_dev); } static __le16 lowpan_get_pan_id(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev); } static __le16 lowpan_get_short_addr(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev); } static u8 lowpan_get_dsn(const struct net_device *dev) { struct net_device *real_dev = lowpan_dev_info(dev)->real_dev; + return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev); } @@ -433,7 +482,7 @@ static void lowpan_setup(struct net_device *dev) /* Frame Control + Sequence Number + Address fields + Security Header */ dev->hard_header_len = 2 + 1 + 20 + 14; dev->needed_tailroom = 2; /* FCS */ - dev->mtu = 1281; + dev->mtu = IPV6_MIN_MTU; dev->tx_queue_len = 0; dev->flags = IFF_BROADCAST | IFF_MULTICAST; dev->watchdog_timeo = 0; @@ -454,7 +503,7 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[]) } static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *orig_dev) { struct ieee802154_hdr hdr; int ret; diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index 8af1330b3137..c0d4154d144f 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -12,13 +12,6 @@ config IEEE802154 config IEEE802154_6LOWPAN tristate "6lowpan support over IEEE 802.15.4" - depends on IEEE802154 && IPV6 - select 6LOWPAN_IPHC + depends on IEEE802154 && 6LOWPAN ---help--- IPv6 compression over IEEE 802.15.4. 
- -config 6LOWPAN_IPHC - tristate - ---help--- - 6lowpan compression code which is shared between IEEE 802.15.4 and Bluetooth - stacks. diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index bf1b51497a41..3914b1ed4274 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -1,8 +1,7 @@ obj-$(CONFIG_IEEE802154) += ieee802154.o af_802154.o -obj-$(CONFIG_IEEE802154_6LOWPAN) += 6lowpan.o -obj-$(CONFIG_6LOWPAN_IPHC) += 6lowpan_iphc.o +obj-$(CONFIG_IEEE802154_6LOWPAN) += ieee802154_6lowpan.o -6lowpan-y := 6lowpan_rtnl.o reassembly.o +ieee802154_6lowpan-y := 6lowpan_rtnl.o reassembly.o ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o \ header_ops.o af_802154-y := af_ieee802154.o raw.o dgram.o diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c index 351d9a94ec2f..29e0de63001b 100644 --- a/net/ieee802154/af_ieee802154.c +++ b/net/ieee802154/af_ieee802154.c @@ -40,9 +40,7 @@ #include "af802154.h" -/* - * Utility function for families - */ +/* Utility function for families */ struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { @@ -87,8 +85,8 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) rtnl_unlock(); break; default: - pr_warning("Unsupported ieee802154 address type: %d\n", - addr->mode); + pr_warn("Unsupported ieee802154 address type: %d\n", + addr->mode); break; } @@ -106,7 +104,7 @@ static int ieee802154_sock_release(struct socket *sock) return 0; } static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -114,7 +112,7 @@ static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock, } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, - int addr_len) + int addr_len) { struct sock *sk = sock->sk; @@ -125,7 +123,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct 
sockaddr *uaddr, } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) + int addr_len, int flags) { struct sock *sk = sock->sk; @@ -139,7 +137,7 @@ static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, - unsigned int cmd) + unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; @@ -167,7 +165,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) + unsigned long arg) { struct sock *sk = sock->sk; @@ -238,8 +236,7 @@ static const struct proto_ops ieee802154_dgram_ops = { }; -/* - * Create a socket. Initialise the socket, blank the addresses +/* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, @@ -301,13 +298,14 @@ static const struct net_proto_family ieee802154_family_ops = { }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG - print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); + print_hex_dump_bytes("ieee802154_rcv ", + DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index 4f0ed8780194..ef2ad8aaef13 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -149,8 +149,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { - /* - * We will only return the amount + /* We will only return the amount * of 
this packet since that is all * that will be read. */ @@ -161,12 +160,13 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) } } + return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, - int len) + int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); @@ -205,7 +205,7 @@ static int dgram_disconnect(struct sock *sk, int flags) } static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) + struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -248,8 +248,8 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, - msg->msg_flags & MSG_DONTWAIT, - &err); + msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) goto out_dev; @@ -262,7 +262,8 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk, cb->ackreq = ro->want_ack; if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, + daddr, msg->msg_name); ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { @@ -304,8 +305,8 @@ out: } static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, int noblock, int flags, - int *addr_len) + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; @@ -398,6 +399,7 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) dgram_sk(sk))) { if (prev) { struct sk_buff *clone; + clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); @@ -407,9 +409,9 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) } } - if (prev) + if (prev) { dgram_rcv_skb(prev, skb); - else { + } else { kfree_skb(skb); ret = NET_RX_DROP; } @@ 
-419,7 +421,7 @@ int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) } static int dgram_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); @@ -463,7 +465,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, } static int dgram_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h index 8b83a231299e..5d352f86979e 100644 --- a/net/ieee802154/ieee802154.h +++ b/net/ieee802154/ieee802154.h @@ -43,7 +43,7 @@ struct genl_info; struct sk_buff *ieee802154_nl_create(int flags, u8 req); int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group); struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, - int flags, u8 req); + int flags, u8 req); int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info); extern struct genl_family nl802154_family; diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 26efcf4fd2ff..9222966f5e6d 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -52,7 +52,7 @@ struct sk_buff *ieee802154_nl_create(int flags, u8 req) spin_lock_irqsave(&ieee802154_seq_lock, f); hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, - &nl802154_family, flags, req); + &nl802154_family, flags, req); spin_unlock_irqrestore(&ieee802154_seq_lock, f); if (!hdr) { nlmsg_free(msg); @@ -86,7 +86,7 @@ struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, return NULL; hdr = genlmsg_put_reply(msg, info, - &nl802154_family, flags, req); + &nl802154_family, flags, req); if (!hdr) { nlmsg_free(msg); return NULL; diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index a3281b8bfd5b..c6bfe22bfa5e 100644 --- 
a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -60,7 +60,8 @@ static __le16 nla_get_shortaddr(const struct nlattr *nla) } int ieee802154_nl_assoc_indic(struct net_device *dev, - struct ieee802154_addr *addr, u8 cap) + struct ieee802154_addr *addr, + u8 cap) { struct sk_buff *msg; @@ -93,7 +94,7 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_assoc_indic); int ieee802154_nl_assoc_confirm(struct net_device *dev, __le16 short_addr, - u8 status) + u8 status) { struct sk_buff *msg; @@ -119,7 +120,8 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_assoc_confirm); int ieee802154_nl_disassoc_indic(struct net_device *dev, - struct ieee802154_addr *addr, u8 reason) + struct ieee802154_addr *addr, + u8 reason) { struct sk_buff *msg; @@ -205,8 +207,9 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_beacon_indic); int ieee802154_nl_scan_confirm(struct net_device *dev, - u8 status, u8 scan_type, u32 unscanned, u8 page, - u8 *edl/* , struct list_head *pan_desc_list */) + u8 status, u8 scan_type, + u32 unscanned, u8 page, + u8 *edl/* , struct list_head *pan_desc_list */) { struct sk_buff *msg; @@ -260,7 +263,7 @@ nla_put_failure: EXPORT_SYMBOL(ieee802154_nl_start_confirm); static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, - u32 seq, int flags, struct net_device *dev) + u32 seq, int flags, struct net_device *dev) { void *hdr; struct wpan_phy *phy; @@ -270,7 +273,7 @@ static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, pr_debug("%s\n", __func__); hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, - IEEE802154_LIST_IFACE); + IEEE802154_LIST_IFACE); if (!hdr) goto out; @@ -330,14 +333,16 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; + nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], - sizeof(name)); + sizeof(name)); dev = dev_get_by_name(&init_net, name); - } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) + } else if 
(info->attrs[IEEE802154_ATTR_DEV_INDEX]) { dev = dev_get_by_index(&init_net, nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); - else + } else { return NULL; + } if (!dev) return NULL; @@ -435,7 +440,7 @@ int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) int ret = -EOPNOTSUPP; if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && - !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || + !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_REASON]) return -EINVAL; @@ -464,8 +469,7 @@ out: return ret; } -/* - * PANid, channel, beacon_order = 15, superframe_order = 15, +/* PANid, channel, beacon_order = 15, superframe_order = 15, * PAN_coordinator, battery_life_extension = 0, * coord_realignment = 0, security_enable = 0 */ @@ -559,8 +563,8 @@ int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) page = 0; - ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, - duration); + ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, + page, duration); out: dev_put(dev); @@ -570,7 +574,8 @@ out: int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, - PAN Id, short address */ + * PAN Id, short address + */ struct sk_buff *msg; struct net_device *dev = NULL; int rc = -ENOBUFS; @@ -586,7 +591,7 @@ int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) goto out_dev; rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq, - 0, dev); + 0, dev); if (rc < 0) goto out_free; @@ -598,7 +603,6 @@ out_free: out_dev: dev_put(dev); return rc; - } int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) @@ -616,7 +620,8 @@ int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0) + cb->nlh->nlmsg_seq, + NLM_F_MULTI, dev) < 0) break; cont: idx++; @@ 
-765,6 +770,7 @@ ieee802154_llsec_parse_key_id(struct genl_info *info, case IEEE802154_SCF_KEY_SHORT_INDEX: { u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); + desc->short_source = cpu_to_le32(source); break; } @@ -842,7 +848,7 @@ int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) goto out_dev; hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, - IEEE802154_LLSEC_GETPARAMS); + IEEE802154_LLSEC_GETPARAMS); if (!hdr) goto out_free; @@ -946,7 +952,7 @@ struct llsec_dump_data { static int ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, - int (*step)(struct llsec_dump_data*)) + int (*step)(struct llsec_dump_data *)) { struct net *net = sock_net(skb->sk); struct net_device *dev; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 89b265aea151..972baf83411a 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -36,7 +36,7 @@ #include "ieee802154.h" static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, - u32 seq, int flags, struct wpan_phy *phy) + u32 seq, int flags, struct wpan_phy *phy) { void *hdr; int i, pages = 0; @@ -48,7 +48,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, return -EMSGSIZE; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, - IEEE802154_LIST_PHY); + IEEE802154_LIST_PHY); if (!hdr) goto out; @@ -80,7 +80,8 @@ out: int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, - PAN Id, short address */ + * PAN Id, short address + */ struct sk_buff *msg; struct wpan_phy *phy; const char *name; @@ -105,7 +106,7 @@ int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) goto out_dev; rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq, - 0, phy); + 0, phy); if (rc < 0) goto out_free; @@ -117,7 +118,6 @@ out_free: out_dev: wpan_phy_put(phy); return rc; - } struct dump_phy_data { @@ -137,10 
+137,10 @@ static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data) return 0; rc = ieee802154_nl_fill_phy(data->skb, - NETLINK_CB(data->cb->skb).portid, - data->cb->nlh->nlmsg_seq, - NLM_F_MULTI, - phy); + NETLINK_CB(data->cb->skb).portid, + data->cb->nlh->nlmsg_seq, + NLM_F_MULTI, + phy); if (rc < 0) { data->idx--; @@ -238,10 +238,9 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) addr.sa_family = ARPHRD_IEEE802154; nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], - IEEE802154_ADDR_LEN); + IEEE802154_ADDR_LEN); - /* - * strangely enough, some callbacks (inetdev_event) from + /* strangely enough, some callbacks (inetdev_event) from * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 74d54fae33d7..9d1f64806f02 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -96,7 +96,7 @@ out: } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) + int addr_len) { return -ENOTSUPP; } @@ -106,8 +106,8 @@ static int raw_disconnect(struct sock *sk, int flags) return 0; } -static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; @@ -145,7 +145,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, - msg->msg_flags & MSG_DONTWAIT, &err); + msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; @@ -235,7 +235,6 @@ void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { - struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); @@ -248,13 +247,13 @@ void ieee802154_raw_deliver(struct net_device *dev, struct 
sk_buff *skb) } static int raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen) + char __user *optval, unsigned int optlen) { return -EOPNOTSUPP; } @@ -274,4 +273,3 @@ struct proto ieee802154_raw_prot = { .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; - diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c index 6f1428c4870b..7cfcd6885225 100644 --- a/net/ieee802154/reassembly.c +++ b/net/ieee802154/reassembly.c @@ -30,6 +30,8 @@ #include "reassembly.h" +static const char lowpan_frags_cache_name[] = "lowpan-frags"; + struct lowpan_frag_info { __be16 d_tag; u16 d_size; @@ -50,29 +52,25 @@ static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size, const struct ieee802154_addr *saddr, const struct ieee802154_addr *daddr) { - u32 c; - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - c = jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ieee802154_addr_hash(saddr), + ieee802154_addr_hash(daddr), + (__force u32)(tag + (d_size << 16)), + lowpan_frags.rnd); } -static unsigned int lowpan_hashfn(struct inet_frag_queue *q) +static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) { - struct lowpan_frag_queue *fq; + const struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); } -static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) +static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) { - struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; + const struct lowpan_frag_queue *fq; + const struct 
lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); return fq->tag == arg->tag && fq->d_size == arg->d_size && @@ -80,10 +78,10 @@ static bool lowpan_frag_match(struct inet_frag_queue *q, void *a) ieee802154_addr_equal(&fq->daddr, arg->dst); } -static void lowpan_frag_init(struct inet_frag_queue *q, void *a) +static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { + const struct lowpan_create_arg *arg = a; struct lowpan_frag_queue *fq; - struct lowpan_create_arg *arg = a; fq = container_of(q, struct lowpan_frag_queue, q); @@ -103,7 +101,7 @@ static void lowpan_frag_expire(unsigned long data) spin_lock(&fq->q.lock); - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q, &lowpan_frags); @@ -128,7 +126,6 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info, arg.src = src; arg.dst = dst; - read_lock(&lowpan_frags.lock); hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst); q = inet_frag_find(&ieee802154_lowpan->frags, @@ -147,7 +144,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, struct net_device *dev; int end, offset; - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = lowpan_cb(skb)->d_offset << 3; @@ -159,14 +156,14 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; - fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & INET_FRAG_LAST_IN) + if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } @@ -206,13 +203,13 @@ found: if (frag_type == LOWPAN_DISPATCH_FRAG1) { /* Calculate uncomp. 
6lowpan header to estimate full size */ fq->q.meat += lowpan_uncompress_size(skb, NULL); - fq->q.last_in |= INET_FRAG_FIRST_IN; + fq->q.flags |= INET_FRAG_FIRST_IN; } else { fq->q.meat += skb->len; } add_frag_mem_limit(&fq->q, skb->truesize); - if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; @@ -223,7 +220,6 @@ found: return res; } - inet_frag_lru_move(&fq->q); return -1; err: kfree_skb(skb); @@ -359,8 +355,6 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) struct net *net = dev_net(skb->dev); struct lowpan_frag_info *frag_info = lowpan_cb(skb); struct ieee802154_addr source, dest; - struct netns_ieee802154_lowpan *ieee802154_lowpan = - net_ieee802154_lowpan(net); int err; source = mac_cb(skb)->source; @@ -370,14 +364,15 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type) if (err < 0) goto err; - if (frag_info->d_size > ieee802154_lowpan->max_dsize) + if (frag_info->d_size > IPV6_MIN_MTU) { + net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; - - inet_frag_evictor(&ieee802154_lowpan->frags, &lowpan_frags, false); + } fq = fq_find(net, frag_info, &source, &dest); if (fq != NULL) { int ret; + spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); @@ -393,20 +388,25 @@ err: EXPORT_SYMBOL(lowpan_frag_rcv); #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = 
proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, { .procname = "6lowpanfrag_time", @@ -415,20 +415,15 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "6lowpanfrag_max_datagram_size", - .data = &init_net.ieee802154_lowpan.max_dsize, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { } }; +/* secret interval has been deprecated */ +static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", - .data = &lowpan_frags.secret_interval, + .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -451,9 +446,11 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &ieee802154_lowpan->frags.high_thresh; + table[0].extra1 = &ieee802154_lowpan->frags.low_thresh; + table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh; table[1].data = &ieee802154_lowpan->frags.low_thresh; + table[1].extra2 = &ieee802154_lowpan->frags.high_thresh; table[2].data = &ieee802154_lowpan->frags.timeout; - table[3].data = &ieee802154_lowpan->max_dsize; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -488,7 +485,7 @@ static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) static struct ctl_table_header *lowpan_ctl_header; -static int lowpan_frags_sysctl_register(void) +static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", @@ -510,7 +507,7 @@ static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } -static inline int lowpan_frags_sysctl_register(void) +static inline int __init lowpan_frags_sysctl_register(void) { return 0; } @@ -528,7 +525,6 @@ 
static int __net_init lowpan_frags_init_net(struct net *net) ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - ieee802154_lowpan->max_dsize = 0xFFFF; inet_frags_init_net(&ieee802154_lowpan->frags); @@ -568,8 +564,10 @@ int __init lowpan_net_frag_init(void) lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; - lowpan_frags.secret_interval = 10 * 60 * HZ; - inet_frags_init(&lowpan_frags); + lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + ret = inet_frags_init(&lowpan_frags); + if (ret) + goto err_pernet; return ret; err_pernet: diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c index 8d6f6704da84..4955e0fe5883 100644 --- a/net/ieee802154/wpan-class.c +++ b/net/ieee802154/wpan-class.c @@ -48,7 +48,8 @@ MASTER_SHOW(transmit_power, "%d +- 1 dB"); MASTER_SHOW(cca_mode, "%d"); static ssize_t channels_supported_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, + char *buf) { struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); int ret; @@ -57,7 +58,7 @@ static ssize_t channels_supported_show(struct device *dev, mutex_lock(&phy->pib_lock); for (i = 0; i < 32; i++) { ret = snprintf(buf + len, PAGE_SIZE - len, - "%#09x\n", phy->channels_supported[i]); + "%#09x\n", phy->channels_supported[i]); if (ret < 0) break; len += ret; @@ -80,6 +81,7 @@ ATTRIBUTE_GROUPS(pmib); static void wpan_phy_release(struct device *d) { struct wpan_phy *phy = container_of(d, struct wpan_phy, dev); + kfree(phy); } @@ -121,11 +123,12 @@ static int wpan_phy_iter(struct device *dev, void *_data) { struct wpan_phy_iter_data *wpid = _data; struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); + return wpid->fn(phy, wpid->data); } int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), 
- void *data) + void *data) { struct wpan_phy_iter_data wpid = { .fn = fn, @@ -197,6 +200,7 @@ EXPORT_SYMBOL(wpan_phy_free); static int __init wpan_phy_class_init(void) { int rc; + rc = class_register(&wpan_phy_class); if (rc) goto err; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 05c57f0fcabe..e682b48e0709 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -307,6 +307,35 @@ config NET_IPVTI the notion of a secure tunnel for IPSEC and then use routing protocol on top. +config NET_UDP_TUNNEL + tristate + select NET_IP_TUNNEL + default n + +config NET_FOU + tristate "IP: Foo (IP protocols) over UDP" + select XFRM + select NET_UDP_TUNNEL + ---help--- + Foo over UDP allows any IP protocol to be directly encapsulated + over UDP, including tunnels (IPIP, GRE, SIT). By encapsulating in UDP, + network mechanisms and optimizations for UDP (such as ECMP + and RSS) can be leveraged to provide better service. + +config GENEVE + tristate "Generic Network Virtualization Encapsulation (Geneve)" + depends on INET + select NET_UDP_TUNNEL + ---help--- + This allows one to create Geneve virtual interfaces that provide + Layer 2 Networks over Layer 3 Networks. Geneve is often used + to tunnel virtual network infrastructure in virtualized environments. + For more information see: + http://tools.ietf.org/html/draft-gross-geneve-01 + + To compile this driver as a module, choose M here: the module will be called geneve. + + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO @@ -556,6 +585,27 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_DCTCP + tristate "DataCenter TCP (DCTCP)" + default n + ---help--- + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts.
It is designed to provide: + + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. + + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. + + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -584,9 +634,11 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_DCTCP + bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_RENO bool "Reno" - endchoice endif @@ -606,6 +658,7 @@ config DEFAULT_TCP_CONG default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f032688d20d3..518c04ed666e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,8 +20,10 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o @@ -41,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += 
tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o @@ -53,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d156b3c5f363..92db7a69f2b9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -418,10 +418,6 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -/* It is off by default, see below. */ -int sysctl_ip_nonlocal_bind __read_mostly; -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && @@ -1201,40 +1197,6 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static int inet_gso_send_check(struct sk_buff *skb) -{ - const struct net_offload *ops; - const struct iphdr *iph; - int proto; - int ihl; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) - goto out; - - iph = ip_hdr(skb); - ihl = iph->ihl * 4; - if (ihl < sizeof(*iph)) - goto out; - - proto = iph->protocol; - - /* Warning: after this point, iph might be no longer valid */ - if (unlikely(!pskb_may_pull(skb, ihl))) - goto out; - __skb_pull(skb, ihl); - - skb_reset_transport_header(skb); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && 
ops->callbacks.gso_send_check)) - err = ops->callbacks.gso_send_check(skb); - -out: - return err; -} - static struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -1407,6 +1369,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, * immediately following this IP hdr. */ + /* Note : No need to call skb_gro_postpull_rcsum() here, + * as we already checked checksum over ipv4 header was 0 + */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1659,7 +1624,6 @@ static int ipv4_proc_init(void); static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, @@ -1668,8 +1632,9 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { - .gso_send_check = inet_gso_send_check, .gso_segment = inet_gso_segment, + .gro_receive = inet_gro_receive, + .gro_complete = inet_gro_complete, }, }; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2afa89513a0..ac9a32ec3ee4 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -505,8 +505,6 @@ static int ah_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1a9b99e04465..16acb59d665e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -953,10 +953,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, { const struct arphdr *arp; + /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == 
PACKET_LOOPBACK) - goto freeskb; + goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -974,6 +975,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); +consumeskb: + consume_skb(skb); + return 0; freeskb: kfree_skb(skb); out_of_mem: diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 05b708bbdb0d..4715f25dfe03 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -246,7 +246,7 @@ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) * success, negative values on error. * */ -static int cipso_v4_cache_init(void) +static int __init cipso_v4_cache_init(void) { u32 iter; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index a3095fdefbed..90c0e8386116 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; + inet_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e9449376b58e..214882e7d6de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL -static void devinet_sysctl_register(struct in_device *idev); +static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else -static void devinet_sysctl_register(struct in_device *idev) +static int devinet_sysctl_register(struct in_device *idev) { + return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { @@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device 
*in_dev; + int err = -ENOMEM; ASSERT_RTNL(); @@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev) /* Account for reference dev->ip_ptr (below) */ in_dev_hold(in_dev); - devinet_sysctl_register(in_dev); + err = devinet_sysctl_register(in_dev); + if (err) { + in_dev->dead = 1; + in_dev_put(in_dev); + in_dev = NULL; + goto out; + } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); @@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev) /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: - return in_dev; + return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; @@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); - if (!in_dev) - return notifier_from_errno(-ENOMEM); + if (IS_ERR(in_dev)) + return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); @@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) kfree(t); } -static void devinet_sysctl_register(struct in_device *idev) +static int devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); - __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, + int err; + + if (!sysctl_dev_name_is_allowed(idev->dev->name)) + return -EINVAL; + + err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); + if (err) + return err; + err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, &idev->cnf); + if (err) + neigh_sysctl_unregister(idev->arp_parms); + return err; } static void devinet_sysctl_unregister(struct in_device *idev) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 255aa9946fe7..23104a3f2924 100644 --- a/net/ipv4/fib_frontend.c +++ 
b/net/ipv4/fib_frontend.c @@ -243,7 +243,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { - int ret, no_addr, accept_local; + int ret, no_addr; struct fib_result res; struct flowi4 fl4; struct net *net; @@ -258,16 +258,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, no_addr = idev->ifa_list == NULL; - accept_local = IN_DEV_ACCEPT_LOCAL(idev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) goto last_resort; - if (res.type != RTN_UNICAST) { - if (res.type != RTN_LOCAL || !accept_local) - goto e_inval; - } + if (res.type != RTN_UNICAST && + (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) + goto e_inval; + if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) + goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -321,6 +322,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int r = secpath_exists(skb) ? 
0 : IN_DEV_RPFILTER(idev); if (!r && !fib_num_tclassid_users(dev_net(dev)) && + IN_DEV_ACCEPT_LOCAL(idev) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { *itag = 0; return 0; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b10cd43a4722..5b6efb3d2308 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -157,9 +157,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) static void free_nh_exceptions(struct fib_nh *nh) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash; int i; + hash = rcu_dereference_protected(nh->nh_exceptions, 1); + if (!hash) + return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; @@ -205,8 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); - if (nexthop_nh->nh_exceptions) - free_nh_exceptions(nexthop_nh); + free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7c..e9cb2588e416 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c new file mode 100644 index 000000000000..efa70ad44906 --- /dev/null +++ b/net/ipv4/fou.c @@ -0,0 +1,514 @@ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <net/genetlink.h> +#include <net/gue.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/udp.h> 
+#include <net/udp_tunnel.h> +#include <net/xfrm.h> +#include <uapi/linux/fou.h> +#include <uapi/linux/genetlink.h> + +static DEFINE_SPINLOCK(fou_lock); +static LIST_HEAD(fou_list); + +struct fou { + struct socket *sock; + u8 protocol; + u16 port; + struct udp_offload udp_offloads; + struct list_head list; +}; + +struct fou_cfg { + u16 type; + u8 protocol; + struct udp_port_cfg udp_config; +}; + +static inline struct fou *fou_from_sock(struct sock *sk) +{ + return sk->sk_user_data; +} + +static int fou_udp_encap_recv_deliver(struct sk_buff *skb, + u8 protocol, size_t len) +{ + struct iphdr *iph = ip_hdr(skb); + + /* Remove 'len' bytes from the packet (UDP header and + * FOU header if present), modify the protocol to the one + * we found, and then call rcv_encap. + */ + iph->tot_len = htons(ntohs(iph->tot_len) - len); + __skb_pull(skb, len); + skb_postpull_rcsum(skb, udp_hdr(skb), len); + skb_reset_transport_header(skb); + + return -protocol; +} + +static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + + if (!fou) + return 1; + + return fou_udp_encap_recv_deliver(skb, fou->protocol, + sizeof(struct udphdr)); +} + +static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + size_t len; + struct guehdr *guehdr; + struct udphdr *uh; + + if (!fou) + return 1; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) + goto drop; + + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + + len += guehdr->hlen << 2; + if (!pskb_may_pull(skb, len)) + goto drop; + + if (guehdr->version != 0) + goto drop; + + if (guehdr->flags) { + /* No support yet */ + goto drop; + } + + return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); +drop: + kfree_skb(skb); + return 0; +} + +static struct sk_buff **fou_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + u8 proto = 
NAPI_GRO_CB(skb)->proto; + const struct net_offload **offloads; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + + return pp; +} + +static int fou_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload *ops; + u8 proto = NAPI_GRO_CB(skb)->proto; + int err = -ENOSYS; + const struct net_offload **offloads; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct sk_buff **gue_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + u8 proto; + struct guehdr *guehdr; + unsigned int hlen, guehlen; + unsigned int off; + int flush = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out; + } + + proto = guehdr->next_hdr; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + hlen = off + guehlen; + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out_unlock; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + const struct guehdr *guehdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + guehdr2 = (struct guehdr *)(p->data + off); + + /* Compare base GUE header to be equal (covers + * hlen, version, next_hdr, and flags). + */ + if (guehdr->word != guehdr2->word) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Check that the optional fields are the same. */ + if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], + guehdr->hlen << 2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, guehlen); + + /* Adjust NAPI_GRO_CB(skb)->csum after skb_gro_pull() */ + skb_gro_postpull_rcsum(skb, guehdr, guehlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gue_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload **offloads; + struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload *ops; + unsigned int guehlen; + u8 proto; + int err = -ENOENT; + + proto = guehdr->next_hdr; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ?
inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + +out_unlock: + rcu_read_unlock(); + return err; +} + +static int fou_add_to_port_list(struct fou *fou) +{ + struct fou *fout; + + spin_lock(&fou_lock); + list_for_each_entry(fout, &fou_list, list) { + if (fou->port == fout->port) { + spin_unlock(&fou_lock); + return -EALREADY; + } + } + + list_add(&fou->list, &fou_list); + spin_unlock(&fou_lock); + + return 0; +} + +static void fou_release(struct fou *fou) +{ + struct socket *sock = fou->sock; + struct sock *sk = sock->sk; + + udp_del_offload(&fou->udp_offloads); + + list_del(&fou->list); + + /* Remove hooks into tunnel socket */ + sk->sk_user_data = NULL; + + sock_release(sock); + + kfree(fou); +} + +static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->protocol = cfg->protocol; + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + return 0; +} + +static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = gue_udp_recv; + fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; + fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + + return 0; +} + +static int fou_create(struct net *net, struct fou_cfg *cfg, + struct socket **sockp) +{ + struct fou *fou = NULL; + int err; + struct socket *sock = NULL; + struct sock *sk; + + /* Open UDP socket */ + err = udp_sock_create(net, &cfg->udp_config, &sock); + if (err < 0) + goto error; + + /* Allocate FOU port structure */ + fou = kzalloc(sizeof(*fou), GFP_KERNEL); + if (!fou) { + err = -ENOMEM; + 
goto error; + } + + sk = sock->sk; + + fou->port = cfg->udp_config.local_udp_port; + + /* Initial for fou type */ + switch (cfg->type) { + case FOU_ENCAP_DIRECT: + err = fou_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + case FOU_ENCAP_GUE: + err = gue_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + default: + err = -EINVAL; + goto error; + } + + udp_sk(sk)->encap_type = 1; + udp_encap_enable(); + + sk->sk_user_data = fou; + fou->sock = sock; + + udp_set_convert_csum(sk, true); + + sk->sk_allocation = GFP_ATOMIC; + + if (cfg->udp_config.family == AF_INET) { + err = udp_add_offload(&fou->udp_offloads); + if (err) + goto error; + } + + err = fou_add_to_port_list(fou); + if (err) + goto error; + + if (sockp) + *sockp = sock; + + return 0; + +error: + kfree(fou); + if (sock) + sock_release(sock); + + return err; +} + +static int fou_destroy(struct net *net, struct fou_cfg *cfg) +{ + struct fou *fou; + u16 port = cfg->udp_config.local_udp_port; + int err = -EINVAL; + + spin_lock(&fou_lock); + list_for_each_entry(fou, &fou_list, list) { + if (fou->port == port) { + udp_del_offload(&fou->udp_offloads); + fou_release(fou); + err = 0; + break; + } + } + spin_unlock(&fou_lock); + + return err; +} + +static struct genl_family fou_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct fou_cfg *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->udp_config.family = AF_INET; + + if (info->attrs[FOU_ATTR_AF]) { + u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); + + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + + cfg->udp_config.family = 
family; + } + + if (info->attrs[FOU_ATTR_PORT]) { + u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); + + cfg->udp_config.local_udp_port = port; + } + + if (info->attrs[FOU_ATTR_IPPROTO]) + cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + + if (info->attrs[FOU_ATTR_TYPE]) + cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); + + return 0; +} + +static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + int err; + + err = parse_nl_config(info, &cfg); + if (err) + return err; + + return fou_create(&init_net, &cfg, NULL); +} + +static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + + parse_nl_config(info, &cfg); + + return fou_destroy(&init_net, &cfg); +} + +static const struct genl_ops fou_nl_ops[] = { + { + .cmd = FOU_CMD_ADD, + .doit = fou_nl_cmd_add_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .doit = fou_nl_cmd_rm_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static int __init fou_init(void) +{ + int ret; + + ret = genl_register_family_with_ops(&fou_nl_family, + fou_nl_ops); + + return ret; +} + +static void __exit fou_fini(void) +{ + struct fou *fou, *next; + + genl_unregister_family(&fou_nl_family); + + /* Close all the FOU sockets */ + + spin_lock(&fou_lock); + list_for_each_entry_safe(fou, next, &fou_list, list) + fou_release(fou); + spin_unlock(&fou_lock); +} + +module_init(fou_init); +module_exit(fou_fini); +MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c new file mode 100644 index 000000000000..065cd94c640c --- /dev/null +++ b/net/ipv4/geneve.c @@ -0,0 +1,373 @@ +/* + * Geneve: Generic Network Virtualization Encapsulation + * + * Copyright (c) 2014 Nicira, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <linux/rculist.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/igmp.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/hash.h> +#include <linux/ethtool.h> +#include <net/arp.h> +#include <net/ndisc.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/rtnetlink.h> +#include <net/route.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/geneve.h> +#include <net/protocol.h> +#include <net/udp_tunnel.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> +#endif + +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) + +/* per-network namespace private data for this module */ +struct geneve_net { + struct hlist_head sock_list[PORT_HASH_SIZE]; + spinlock_t sock_lock; /* Protects sock_list */ +}; + +static int geneve_net_id; + +static struct workqueue_struct *geneve_wq; + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +static struct hlist_head *gs_head(struct net *net, __be16 port) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + + return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* Find geneve socket based on network 
namespace and UDP port */ +static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) +{ + struct geneve_sock *gs; + + hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { + if (inet_sk(gs->sock->sk)->inet_sport == port) + return gs; + } + + return NULL; +} + +static void geneve_build_header(struct genevehdr *geneveh, + __be16 tun_flags, u8 vni[3], + u8 options_len, u8 *options) +{ + geneveh->ver = GENEVE_VER; + geneveh->opt_len = options_len / 4; + geneveh->oam = !!(tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + geneveh->rsvd1 = 0; + memcpy(geneveh->vni, vni, 3); + geneveh->proto_type = htons(ETH_P_TEB); + geneveh->rsvd2 = 0; + + memcpy(geneveh->options, options, options_len); +} + +/* Transmit a fully formated Geneve frame. + * + * When calling this function. The skb->data should point + * to the geneve header which is fully formed. + * + * This function will add other UDP tunnel headers. + */ +int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool xnet) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? 
VLAN_HLEN : 0); + + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (unlikely(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) { + err = -ENOMEM; + return err; + } + skb->vlan_tci = 0; + } + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + + return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, + tos, ttl, df, src_port, dst_port, xnet); +} +EXPORT_SYMBOL_GPL(geneve_xmit_skb); + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + gs->rcv(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static void geneve_del_work(struct work_struct *work) +{ + struct geneve_sock *gs = container_of(work, struct geneve_sock, + del_work); + + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); +} + +static struct 
socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&gs->del_work, geneve_del_work); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + atomic_set(&gs->refcnt, 1); + gs->rcv = rcv; + gs->rcv_data = data; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = NULL; + gs->udp_offloads.callbacks.gro_complete = NULL; + + spin_lock(&gn->sock_lock); + hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); + geneve_notify_add_rx_port(gs); + spin_unlock(&gn->sock_lock); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + return gs; +} + +struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) +{ + struct geneve_sock *gs; + + gs = geneve_socket_create(net, port, rcv, data, ipv6); + if (!IS_ERR(gs)) + return gs; + + if 
(no_share) /* Return error if sharing is not allowed. */ + return ERR_PTR(-EINVAL); + + gs = geneve_find_sock(net, port); + if (gs) { + if (gs->rcv == rcv) + atomic_inc(&gs->refcnt); + else + gs = ERR_PTR(-EBUSY); + } else { + gs = ERR_PTR(-EINVAL); + } + + return gs; +} +EXPORT_SYMBOL_GPL(geneve_sock_add); + +void geneve_sock_release(struct geneve_sock *gs) +{ + if (!atomic_dec_and_test(&gs->refcnt)) + return; + + queue_work(geneve_wq, &gs->del_work); +} +EXPORT_SYMBOL_GPL(geneve_sock_release); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + unsigned int h; + + spin_lock_init(&gn->sock_lock); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&gn->sock_list[h]); + + return 0; +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .exit = NULL, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +static int __init geneve_init_module(void) +{ + int rc; + + geneve_wq = alloc_workqueue("geneve", 0, 0); + if (!geneve_wq) + return -ENOMEM; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + return rc; + + pr_info("Geneve driver\n"); + + return 0; +} +late_initcall(geneve_init_module); + +static void __exit geneve_cleanup_module(void) +{ + destroy_workqueue(geneve_wq); +} +module_exit(geneve_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); +MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); +MODULE_ALIAS_RTNL_LINK("geneve"); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 0485bf7f8f03..4a7b5b2a1ce3 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -98,7 +98,6 @@ EXPORT_SYMBOL_GPL(gre_build_header); static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { - unsigned int ip_hlen = ip_hdrlen(skb); const struct gre_base_hdr *greh; __be32 *options; int hdr_len; @@ -106,7 +105,7 @@ static int parse_gre_header(struct sk_buff 
*skb, struct tnl_ptk_info *tpi, if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) return -EINVAL; - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + greh = (struct gre_base_hdr *)skb_transport_header(skb); if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; @@ -116,7 +115,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!pskb_may_pull(skb, hdr_len)) return -EINVAL; - greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); + greh = (struct gre_base_hdr *)skb_transport_header(skb); tpi->proto = greh->protocol; options = (__be32 *)(greh + 1); @@ -125,6 +124,10 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, *csum_err = true; return -EINVAL; } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index f0bdd47bbbcb..a77729503071 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -15,13 +15,6 @@ #include <net/protocol.h> #include <net/gre.h> -static int gre_gso_send_check(struct sk_buff *skb) -{ - if (!skb->encapsulation) - return -EINVAL; - return 0; -} - static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -46,6 +39,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_IPIP))) goto out; + if (!skb->encapsulation) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) goto out; @@ -74,7 +70,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, /* segment inner packet. 
*/ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { + if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); goto out; } @@ -119,28 +115,6 @@ out: return segs; } -/* Compute the whole skb csum in s/w and store it, then verify GRO csum - * starting from gro_offset. - */ -static __sum16 gro_skb_checksum(struct sk_buff *skb) -{ - __sum16 sum; - - skb->csum = skb_checksum(skb, 0, skb->len, 0); - NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, - csum_partial(skb->data, skb_gro_offset(skb), 0)); - sum = csum_fold(NAPI_GRO_CB(skb)->csum); - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { - if (unlikely(!sum) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } else { - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - } - - return sum; -} - static struct sk_buff **gre_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -192,22 +166,16 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, if (unlikely(!greh)) goto out_unlock; } - if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ - __sum16 csum = 0; - - if (skb->ip_summed == CHECKSUM_COMPLETE) - csum = csum_fold(NAPI_GRO_CB(skb)->csum); - /* Don't trust csum error calculated/reported by h/w */ - if (skb->ip_summed == CHECKSUM_NONE || csum != 0) - csum = gro_skb_checksum(skb); - - /* GRE CSUM is the 1's complement of the 1's complement sum - * of the GRE hdr plus payload so it should add up to 0xffff - * (and 0 after csum_fold()) just like the IPv4 hdr csum. - */ - if (csum) + + /* Don't bother verifying checksum if we're going to flush anyway. 
*/ + if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { + if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; + + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); } + flush = 0; for (p = *head; p; p = p->next) { @@ -284,7 +252,6 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload gre_offload = { .callbacks = { - .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 42b7bcf8045b..5882f584910e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); } +int sysctl_icmp_msgs_per_sec __read_mostly = 1000; +int sysctl_icmp_msgs_burst __read_mostly = 50; + +static struct { + spinlock_t lock; + u32 credit; + u32 stamp; +} icmp_global = { + .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), +}; + +/** + * icmp_global_allow - Are we allowed to send one more ICMP message ? + * + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Returns false if we reached the limit and can not send another packet. + * Note: called with BH disabled + */ +bool icmp_global_allow(void) +{ + u32 credit, delta, incr = 0, now = (u32)jiffies; + bool rc = false; + + /* Check if token bucket is empty and cannot be refilled + * without taking the spinlock. 
+ */ + if (!icmp_global.credit) { + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta < HZ / 50) + return false; + } + + spin_lock(&icmp_global.lock); + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta >= HZ / 50) { + incr = sysctl_icmp_msgs_per_sec * delta / HZ ; + if (incr) + icmp_global.stamp = now; + } + credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); + if (credit) { + credit--; + rc = true; + } + icmp_global.credit = credit; + spin_unlock(&icmp_global.lock); + return rc; +} +EXPORT_SYMBOL(icmp_global_allow); + /* * Send an ICMP frame. */ -static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) +static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; bool rc = true; @@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + goto out; + + rc = false; + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) @@ -663,16 +719,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. 
*/ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); return; + } raw_icmp_error(skb, protocol, info); - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); - rcu_read_unlock(); } static bool icmp_tag_validation(int proto) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index db710b059bab..fb70e3ecc3e4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -117,7 +117,7 @@ #define IGMP_V2_Unsolicited_Report_Interval (10*HZ) #define IGMP_V3_Unsolicited_Report_Interval (1*HZ) #define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Query_Robustness_Variable 2 #define IGMP_Initial_Report_Delay (1) @@ -756,8 +756,7 @@ static void igmp_ifc_event(struct in_device *in_dev) { if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -932,7 +931,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qrv = ih3->qrv; if (!group) { /* general query */ if (ih3->nsrcs) - return false; /* no sources allowed */ + return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } @@ -1086,8 +1085,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1226,8 +1224,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1321,8 +1318,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) atomic_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST - setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = IGMP_Unsolicited_Report_Count; + setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); + im->unsolicit_count = sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1460,7 +1457,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; + in_dev->mr_qrv = sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1474,6 +1471,9 @@ void ip_mc_up(struct in_device *in_dev) ASSERT_RTNL(); +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_qrv = sysctl_igmp_qrv; +#endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) @@ -1540,7 +1540,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) */ int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; - +#ifdef CONFIG_IP_MULTICAST +int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1575,8 +1577,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1639,8 +1640,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1818,8 +1818,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; + pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2539,7 +2538,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) querier = "NONE"; #endif - if (rcu_dereference(state->in_dev->mc_list) == im) { + if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 3b01959bf4bb..9eb89f3f0ee4 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,6 +25,12 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> +#define INETFRAGS_EVICT_BUCKETS 128 +#define INETFRAGS_EVICT_MAX 512 + +/* don't rebuild inetfrag table with new secret more often than this */ +#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. 
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static void inet_frag_secret_rebuild(unsigned long dummy) +static unsigned int +inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) +{ + return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); +} + +static bool inet_frag_may_rebuild(struct inet_frags *f) +{ + return time_after(jiffies, + f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); +} + +static void inet_frag_secret_rebuild(struct inet_frags *f) { - struct inet_frags *f = (struct inet_frags *)dummy; - unsigned long now = jiffies; int i; - /* Per bucket lock NOT needed here, due to write lock protection */ - write_lock(&f->lock); + write_seqlock_bh(&f->rnd_seqlock); + + if (!inet_frag_may_rebuild(f)) + goto out; get_random_bytes(&f->rnd, sizeof(u32)); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb; struct inet_frag_queue *q; struct hlist_node *n; hb = &f->hash[i]; + spin_lock(&hb->chain_lock); + hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = f->hashfn(q); + unsigned int hval = inet_frag_hashfn(f, q); if (hval != i) { struct inet_frag_bucket *hb_dest; @@ -72,76 +93,200 @@ static void inet_frag_secret_rebuild(unsigned long dummy) /* Relink to new hash chain. */ hb_dest = &f->hash[hval]; + + /* This is the only place where we take + * another chain_lock while already holding + * one. As this will not run concurrently, + * we cannot deadlock on hb_dest lock below, if its + * already locked it will be released soon since + * other caller cannot be waiting for hb lock + * that we've taken above. 
+ */ + spin_lock_nested(&hb_dest->chain_lock, + SINGLE_DEPTH_NESTING); hlist_add_head(&q->list, &hb_dest->chain); + spin_unlock(&hb_dest->chain_lock); } } + spin_unlock(&hb->chain_lock); + } + + f->rebuild = false; + f->last_rebuild_jiffies = jiffies; +out: + write_sequnlock_bh(&f->rnd_seqlock); +} + +static bool inet_fragq_should_evict(const struct inet_frag_queue *q) +{ + return q->net->low_thresh == 0 || + frag_mem_limit(q->net) >= q->net->low_thresh; +} + +static unsigned int +inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) +{ + struct inet_frag_queue *fq; + struct hlist_node *n; + unsigned int evicted = 0; + HLIST_HEAD(expired); + +evict_again: + spin_lock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &hb->chain, list) { + if (!inet_fragq_should_evict(fq)) + continue; + + if (!del_timer(&fq->timer)) { + /* q expiring right now thus increment its refcount so + * it won't be freed under us and wait until the timer + * has finished executing then destroy it + */ + atomic_inc(&fq->refcnt); + spin_unlock(&hb->chain_lock); + del_timer_sync(&fq->timer); + WARN_ON(atomic_read(&fq->refcnt) != 1); + inet_frag_put(fq, f); + goto evict_again; + } + + fq->flags |= INET_FRAG_EVICTED; + hlist_del(&fq->list); + hlist_add_head(&fq->list, &expired); + ++evicted; } - write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->secret_interval); + spin_unlock(&hb->chain_lock); + + hlist_for_each_entry_safe(fq, n, &expired, list) + f->frag_expire((unsigned long) fq); + + return evicted; } -void inet_frags_init(struct inet_frags *f) +static void inet_frag_worker(struct work_struct *work) +{ + unsigned int budget = INETFRAGS_EVICT_BUCKETS; + unsigned int i, evicted = 0; + struct inet_frags *f; + + f = container_of(work, struct inet_frags, frags_work); + + BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); + + local_bh_disable(); + + for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { + evicted += inet_evict_bucket(f, &f->hash[i]); + i = 
(i + 1) & (INETFRAGS_HASHSZ - 1); + if (evicted > INETFRAGS_EVICT_MAX) + break; + } + + f->next_bucket = i; + + local_bh_enable(); + + if (f->rebuild && inet_frag_may_rebuild(f)) + inet_frag_secret_rebuild(f); +} + +static void inet_frag_schedule_worker(struct inet_frags *f) +{ + if (unlikely(!work_pending(&f->frags_work))) + schedule_work(&f->frags_work); +} + +int inet_frags_init(struct inet_frags *f) { int i; + INIT_WORK(&f->frags_work, inet_frag_worker); + for (i = 0; i < INETFRAGS_HASHSZ; i++) { struct inet_frag_bucket *hb = &f->hash[i]; spin_lock_init(&hb->chain_lock); INIT_HLIST_HEAD(&hb->chain); } - rwlock_init(&f->lock); - setup_timer(&f->secret_timer, inet_frag_secret_rebuild, - (unsigned long)f); - f->secret_timer.expires = jiffies + f->secret_interval; - add_timer(&f->secret_timer); + seqlock_init(&f->rnd_seqlock); + f->last_rebuild_jiffies = 0; + f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, + NULL); + if (!f->frags_cachep) + return -ENOMEM; + + return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_init_net(struct netns_frags *nf) { - nf->nqueues = 0; init_frag_mem_limit(nf); - INIT_LIST_HEAD(&nf->lru_list); - spin_lock_init(&nf->lru_lock); } EXPORT_SYMBOL(inet_frags_init_net); void inet_frags_fini(struct inet_frags *f) { - del_timer(&f->secret_timer); + cancel_work_sync(&f->frags_work); + kmem_cache_destroy(f->frags_cachep); } EXPORT_SYMBOL(inet_frags_fini); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) { - nf->low_thresh = 0; + unsigned int seq; + int i; + nf->low_thresh = 0; local_bh_disable(); - inet_frag_evictor(nf, f, true); + +evict_again: + seq = read_seqbegin(&f->rnd_seqlock); + + for (i = 0; i < INETFRAGS_HASHSZ ; i++) + inet_evict_bucket(f, &f->hash[i]); + + if (read_seqretry(&f->rnd_seqlock, seq)) + goto evict_again; + local_bh_enable(); percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags 
*f) +static struct inet_frag_bucket * +get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) +__acquires(hb->chain_lock) { struct inet_frag_bucket *hb; - unsigned int hash; + unsigned int seq, hash; + + restart: + seq = read_seqbegin(&f->rnd_seqlock); - read_lock(&f->lock); - hash = f->hashfn(fq); + hash = inet_frag_hashfn(f, fq); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); + if (read_seqretry(&f->rnd_seqlock, seq)) { + spin_unlock(&hb->chain_lock); + goto restart; + } + + return hb; +} + +static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +{ + struct inet_frag_bucket *hb; + + hb = get_frag_bucket_locked(fq, f); hlist_del(&fq->list); spin_unlock(&hb->chain_lock); - - read_unlock(&f->lock); - inet_frag_lru_del(fq); } void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -149,30 +294,29 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) if (del_timer(&fq->timer)) atomic_dec(&fq->refcnt); - if (!(fq->last_in & INET_FRAG_COMPLETE)) { + if (!(fq->flags & INET_FRAG_COMPLETE)) { fq_unlink(fq, f); atomic_dec(&fq->refcnt); - fq->last_in |= INET_FRAG_COMPLETE; + fq->flags |= INET_FRAG_COMPLETE; } } EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb) + struct sk_buff *skb) { if (f->skb_free) f->skb_free(skb); kfree_skb(skb); } -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, - int *work) +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; - WARN_ON(!(q->last_in & INET_FRAG_COMPLETE)); + WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. 
*/ @@ -186,87 +330,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, fp = xp; } sum = sum_truesize + f->qsize; - if (work) - *work -= sum; sub_frag_mem_limit(q, sum); if (f->destructor) f->destructor(q); - kfree(q); - + kmem_cache_free(f->frags_cachep, q); } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) -{ - struct inet_frag_queue *q; - int work, evicted = 0; - - if (!force) { - if (frag_mem_limit(nf) <= nf->high_thresh) - return 0; - } - - work = frag_mem_limit(nf) - nf->low_thresh; - while (work > 0 || force) { - spin_lock(&nf->lru_lock); - - if (list_empty(&nf->lru_list)) { - spin_unlock(&nf->lru_lock); - break; - } - - q = list_first_entry(&nf->lru_list, - struct inet_frag_queue, lru_list); - atomic_inc(&q->refcnt); - /* Remove q from list to avoid several CPUs grabbing it */ - list_del_init(&q->lru_list); - - spin_unlock(&nf->lru_lock); - - spin_lock(&q->lock); - if (!(q->last_in & INET_FRAG_COMPLETE)) - inet_frag_kill(q, f); - spin_unlock(&q->lock); - - if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, &work); - evicted++; - } - - return evicted; -} -EXPORT_SYMBOL(inet_frag_evictor); - static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, struct inet_frags *f, - void *arg) + struct inet_frag_queue *qp_in, + struct inet_frags *f, + void *arg) { - struct inet_frag_bucket *hb; + struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); struct inet_frag_queue *qp; - unsigned int hash; - - read_lock(&f->lock); /* Protects against hash rebuild */ - /* - * While we stayed w/o the lock other CPU could update - * the rnd seed, so we need to re-calculate the hash - * chain. Fortunatelly the qp_in can be used to get one. 
- */ - hash = f->hashfn(qp_in); - hb = &f->hash[hash]; - spin_lock(&hb->chain_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because - * such entry could be created on other cpu, while we - * released the hash bucket lock. + * such entry could have been created on other cpu before + * we acquired hash bucket lock. */ hlist_for_each_entry(qp, &hb->chain, list) { if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); - qp_in->last_in |= INET_FRAG_COMPLETE; + qp_in->flags |= INET_FRAG_COMPLETE; inet_frag_put(qp_in, f); return qp; } @@ -278,19 +367,24 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &hb->chain); - inet_frag_lru_add(nf, qp); + spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return qp; } static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, - struct inet_frags *f, void *arg) + struct inet_frags *f, + void *arg) { struct inet_frag_queue *q; - q = kzalloc(f->qsize, GFP_ATOMIC); + if (frag_mem_limit(nf) > nf->high_thresh) { + inet_frag_schedule_worker(f); + return NULL; + } + + q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (q == NULL) return NULL; @@ -301,13 +395,13 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); - INIT_LIST_HEAD(&q->lru_list); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) + struct inet_frags *f, + void *arg) { struct inet_frag_queue *q; @@ -319,13 +413,17 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, } struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock) + struct inet_frags *f, void *key, + unsigned int hash) { struct 
inet_frag_bucket *hb; struct inet_frag_queue *q; int depth = 0; + if (frag_mem_limit(nf) > nf->low_thresh) + inet_frag_schedule_worker(f); + + hash &= (INETFRAGS_HASHSZ - 1); hb = &f->hash[hash]; spin_lock(&hb->chain_lock); @@ -333,18 +431,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); return q; } depth++; } spin_unlock(&hb->chain_lock); - read_unlock(&f->lock); if (depth <= INETFRAGS_MAXDEPTH) return inet_frag_create(nf, f, key); - else - return ERR_PTR(-ENOBUFS); + + if (inet_frag_may_rebuild(f)) { + if (!f->rebuild) + f->rebuild = true; + inet_frag_schedule_worker(f); + } + + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 43116e8c8e13..9111a4e22155 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -229,7 +229,7 @@ begin: } } else if (score == hiscore && reuseport) { matches++; - if (((u64)phash * matches) >> 32 == 0) + if (reciprocal_scale(phash, matches) == 0) result = sk; phash = next_pseudo_random32(phash); } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bd5f5928167d..241afd743d2c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,29 +72,10 @@ void inet_peer_base_init(struct inet_peer_base *bp) { bp->root = peer_avl_empty_rcu; seqlock_init(&bp->lock); - bp->flush_seq = ~0U; bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -static atomic_t v4_seq = ATOMIC_INIT(0); -static atomic_t v6_seq = ATOMIC_INIT(0); - -static atomic_t *inetpeer_seq_ptr(int family) -{ - return (family == AF_INET ? 
&v4_seq : &v6_seq); -} - -static inline void flush_check(struct inet_peer_base *base, int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - if (unlikely(base->flush_seq != atomic_read(fp))) { - inetpeer_invalidate_tree(base); - base->flush_seq = atomic_read(fp); - } -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ @@ -444,8 +425,6 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, unsigned int sequence; int invalidated, gccnt = 0; - flush_check(base, daddr->family); - /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ed32313e307c..2811cc18701a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -55,6 +55,7 @@ */ static int sysctl_ipfrag_max_dist __read_mostly = 64; +static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb { @@ -86,11 +87,6 @@ static inline u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_nqueues(struct net *net) -{ - return net->ipv4.frags.nqueues; -} - int ip_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv4.frags); @@ -109,21 +105,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); + ip4_frags.rnd); } -static unsigned int ip4_hashfn(struct inet_frag_queue *q) +static unsigned int ip4_hashfn(const struct inet_frag_queue *q) { - struct ipq *ipq; + const struct ipq *ipq; ipq = container_of(q, struct ipq, q); return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) { - struct ipq *qp; - struct ip4_create_arg 
*arg = a; + const struct ipq *qp; + const struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && @@ -133,14 +129,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a) qp->user == arg->user; } -static void ip4_frag_init(struct inet_frag_queue *q, void *a) +static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags); struct net *net = container_of(ipv4, struct net, ipv4); - struct ip4_create_arg *arg = a; + const struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; @@ -177,18 +173,6 @@ static void ipq_kill(struct ipq *ipq) inet_frag_kill(&ipq->q, &ip4_frags); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - */ -static void ip_evictor(struct net *net) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); -} - /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 
*/ @@ -202,19 +186,22 @@ static void ip_expire(unsigned long arg) spin_lock(&qp->q.lock); - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto out; ipq_kill(qp); - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { + if (!(qp->q.flags & INET_FRAG_EVICTED)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) + goto out; + rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) @@ -227,8 +214,7 @@ static void ip_expire(unsigned long arg) if (err) goto out_rcu_unlock; - /* - * Only an end host needs to send an ICMP + /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (qp->user == IP_DEFRAG_AF_PACKET || @@ -237,7 +223,6 @@ static void ip_expire(unsigned long arg) (skb_rtable(head)->rt_type != RTN_LOCAL))) goto out_rcu_unlock; - /* Send an ICMP "Fragment Reassembly Timeout" message. 
*/ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); out_rcu_unlock: @@ -260,7 +245,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; - read_lock(&ip4_frags.lock); hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); @@ -319,7 +303,7 @@ static int ip_frag_reinit(struct ipq *qp) } while (fp); sub_frag_mem_limit(&qp->q, sum_truesize); - qp->q.last_in = 0; + qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; @@ -340,7 +324,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int err = -ENOENT; u8 ecn; - if (qp->q.last_in & INET_FRAG_COMPLETE) + if (qp->q.flags & INET_FRAG_COMPLETE) goto err; if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && @@ -367,9 +351,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * or have different end, the segment is corrupted. */ if (end < qp->q.len || - ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) + ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; - qp->q.last_in |= INET_FRAG_LAST_IN; + qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { @@ -379,7 +363,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } if (end > qp->q.len) { /* Some bits beyond end -> corruption. 
*/ - if (qp->q.last_in & INET_FRAG_LAST_IN) + if (qp->q.flags & INET_FRAG_LAST_IN) goto err; qp->q.len = end; } @@ -488,13 +472,13 @@ found: qp->ecn |= ecn; add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) - qp->q.last_in |= INET_FRAG_FIRST_IN; + qp->q.flags |= INET_FRAG_FIRST_IN; if (ip_hdr(skb)->frag_off & htons(IP_DF) && skb->len + ihl > qp->q.max_size) qp->q.max_size = skb->len + ihl; - if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; @@ -505,7 +489,6 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -655,9 +638,6 @@ int ip_defrag(struct sk_buff *skb, u32 user) net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - /* Start by cleaning up the memory. */ - ip_evictor(net); - /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; @@ -721,14 +701,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv4.frags.high_thresh }, { .procname = "ipfrag_time", @@ -740,10 +723,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", - .data = &ip4_frags.secret_interval, + .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = 
proc_dointvec_jiffies, @@ -771,7 +756,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; + table[0].extra1 = &net->ipv4.frags.low_thresh; + table[0].extra2 = &init_net.ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; + table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -802,7 +790,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) kfree(table); } -static void ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } @@ -816,7 +804,7 @@ static inline void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void ip4_frags_ctl_register(void) +static inline void __init ip4_frags_ctl_register(void) { } #endif @@ -873,6 +861,7 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; - ip4_frags.secret_interval = 10 * 60 * HZ; - inet_frags_init(&ip4_frags); + ip4_frags.frags_cache_name = ip_frag_cache_name; + if (inet_frags_init(&ip4_frags)) + panic("IP: failed to allocate ip4_frags cache\n"); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b842544aea3..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -239,7 +239,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. 
*/ - gre_build_header(skb, &tpi, tunnel->hlen); + gre_build_header(skb, &tpi, tunnel->tun_hlen); + + skb_set_inner_protocol(skb, tpi.proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -310,7 +312,7 @@ out: static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err = 0; + int err; struct ip_tunnel_parm p; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) @@ -470,13 +472,18 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; + int t_hlen; tunnel = netdev_priv(dev); - tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct iphdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -503,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->addr_len = 4; if (iph->daddr) { @@ -628,6 +635,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipgre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + 
ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; +} + static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); @@ -657,6 +698,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -666,6 +716,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); @@ -694,6 +753,14 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -714,6 +781,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, + 
t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -731,6 +809,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt) { - const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); - sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d3b6b0e9857..e35b71289156 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -516,7 +516,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); #endif @@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? 
rt->dst.header_len : 0; mtu = cork->fragsize; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -962,10 +966,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else - /* only the initial fragment is - time stamped */ - cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -976,7 +976,12 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + + /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes. @@ -1517,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .uc_ttl = -1, }; -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len) { struct ip_options_data replyopts; @@ -1529,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, struct sock *sk; struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt.opt, skb)) + if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64741b938632..c373a9ad4555 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -303,7 +303,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; - rcu_assign_pointer(*rap, ra->next); + RCU_INIT_POINTER(*rap, ra->next); spin_unlock_bh(&ip_ra_lock); if (ra->destructor) @@ -325,7 +325,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra->sk = sk; new_ra->destructor = destructor; - new_ra->next = 
ra; + RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); spin_unlock_bh(&ip_ra_lock); @@ -405,7 +405,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -415,7 +415,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -462,17 +462,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb2 = skb_peek(&sk->sk_error_queue); - if (skb2 != NULL) { - sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else - spin_unlock_bh(&sk->sk_error_queue.lock); - out_free_skb: kfree_skb(skb); out: @@ -1319,7 +1308,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = optval; + msg.msg_control = (__force void *) optval; msg.msg_controllen = len; msg.msg_flags = flags; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 6f9de61dce5f..0bb8e141eacc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,8 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/udp.h> +#include <net/gue.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -69,23 +71,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) } static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct 
dst_entry *dst) + struct dst_entry *dst, __be32 saddr) { struct dst_entry *old_dst; dst_clone(dst); old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); dst_release(old_dst); + idst->saddr = saddr; } -static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +static noinline void tunnel_dst_set(struct ip_tunnel *t, + struct dst_entry *dst, __be32 saddr) { - __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); + __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); } static void tunnel_dst_reset(struct ip_tunnel *t) { - tunnel_dst_set(t, NULL); + tunnel_dst_set(t, NULL, 0); } void ip_tunnel_dst_reset_all(struct ip_tunnel *t) @@ -93,20 +97,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t) int i; for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); } EXPORT_SYMBOL(ip_tunnel_dst_reset_all); -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, + u32 cookie, __be32 *saddr) { + struct ip_tunnel_dst *idst; struct dst_entry *dst; rcu_read_lock(); - dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); + idst = raw_cpu_ptr(t->dst_cache); + dst = rcu_dereference(idst->dst); if (dst && !atomic_inc_not_zero(&dst->__refcnt)) dst = NULL; if (dst) { - if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + if (!dst->obsolete || dst->ops->check(dst, cookie)) { + *saddr = idst->saddr; + } else { tunnel_dst_reset(t); dst_release(dst); dst = NULL; @@ -305,7 +314,7 @@ static struct net_device *__ip_tunnel_create(struct net *net, } ASSERT_RTNL(); - dev = alloc_netdev(ops->priv_size, name, ops->setup); + dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; @@ -367,7 +376,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst); + 
tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -480,6 +489,103 @@ drop: } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); +static int ip_encap_hlen(struct ip_tunnel_encap *e) +{ + switch (e->type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return sizeof(struct udphdr); + case TUNNEL_ENCAP_GUE: + return sizeof(struct udphdr) + sizeof(struct guehdr); + default: + return -EINVAL; + } +} + +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + size_t hdr_len, u8 *protocol, struct flowi4 *fl4) +{ + struct udphdr *uh; + __be16 sport; + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get length and hash before making space in skb */ + + sport = e->sport ? 
: udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + if (e->type == TUNNEL_ENCAP_GUE) { + struct guehdr *guehdr = (struct guehdr *)&uh[1]; + + guehdr->version = 0; + guehdr->hlen = 0; + guehdr->flags = 0; + guehdr->next_hdr = *protocol; + } + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; + + return 0; +} + +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4) +{ + switch (t->encap.type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + case TUNNEL_ENCAP_GUE: + return fou_build_header(skb, &t->encap, t->encap_hlen, + protocol, fl4); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(ip_tunnel_encap); + static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df) { @@ -529,7 +635,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, } void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol) + const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; @@ -610,7 +716,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); - rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) + goto tx_error; + + rt = connected ? 
tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -620,7 +729,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst); + tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { @@ -663,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) - + rt->dst.header_len; + + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; @@ -757,9 +866,14 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); - if (!t && (cmd == SIOCADDTUNNEL)) { - t = ip_tunnel_create(net, itn, p); - err = PTR_ERR_OR_ZERO(t); + if (cmd == SIOCADDTUNNEL) { + if (!t) { + t = ip_tunnel_create(net, itn, p); + err = PTR_ERR_OR_ZERO(t); + break; + } + + err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b8960f3527f3..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { static int __init vti_init(void) { + const char *msg; int err; - pr_info("IPv4 over IPSec tunneling driver\n"); + pr_info("IPv4 over IPsec tunneling driver\n"); + msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) - return err; - err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); - if (err < 0) { - unregister_pernet_device(&vti_net_ops); - 
pr_info("vti init: can't register tunnel\n"); - - return err; - } + goto pernet_dev_failed; + msg = "tunnel protocols"; + err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); - if (err < 0) { - xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti_net_ops); - pr_info("vti init: can't register tunnel\n"); - - return err; - } - + if (err < 0) + goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); - if (err < 0) { - xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); - xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); - unregister_pernet_device(&vti_net_ops); - pr_info("vti init: can't register tunnel\n"); - - return err; - } + if (err < 0) + goto xfrm_proto_comp_failed; + msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; @@ -576,23 +564,23 @@ static int __init vti_init(void) rtnl_link_failed: xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); +pernet_dev_failed: + pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) - pr_info("vti close: can't deregister tunnel\n"); - if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) - pr_info("vti close: can't deregister tunnel\n"); - if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) - pr_info("vti close: can't deregister tunnel\n"); - - + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&vti_ah4_protocol, 
IPPROTO_AH); + xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b3e86ea7b71b..648fa1490ea7 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ -__be32 ic_dev_xid; /* Device under configuration */ - /* vendor class identifier */ static char vendor_class_identifier[253] __initdata; @@ -264,7 +262,8 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); - while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + while (time_before(jiffies, start + + msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) { int wait, elapsed; for_each_netdev(&init_net, dev) @@ -654,6 +653,7 @@ static struct packet_type bootp_packet_type __initdata = { .func = ic_bootp_recv, }; +static __be32 ic_dev_xid; /* Device under configuration */ /* * Initialize DHCP/BOOTP extension fields in the request. 
@@ -1218,10 +1218,10 @@ static int __init ic_dynamic(void) get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); for (;;) { +#ifdef IPCONFIG_BOOTP /* Track the device we are configuring */ ic_dev_xid = d->xid; -#ifdef IPCONFIG_BOOTP if (do_bootp && (d->able & IC_BOOTP)) ic_bootp_send_if(d, jiffies - start_jiffies); #endif diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 62eaa005e146..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -224,6 +224,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (IS_ERR(skb)) goto out; + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); return NETDEV_TX_OK; @@ -287,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; @@ -301,7 +303,8 @@ static int ipip_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - tunnel->hlen = 0; + tunnel->tun_hlen = 0; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; tunnel->parms.iph.protocol = IPPROTO_IPIP; return ip_tunnel_init(dev); } @@ -340,10 +343,53 @@ static void ipip_netlink_parms(struct nlattr *data[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipip_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = 
nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} + static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipip_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipip_netlink_parms(data, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -353,6 +399,15 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipip_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipip_netlink_parms(data, &p); @@ -378,6 +433,14 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -394,6 +457,17 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.flags)) + goto nla_put_failure; + return 0; 
nla_put_failure: @@ -407,6 +481,10 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 65bcaa789043..c8034587859d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) else sprintf(name, "pimreg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a26ce035e3fa..4c019d5c3f57 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" @@ -51,9 +61,36 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. +config NF_REJECT_IPV4 + tristate "IPv4 packet rejection" + default m if NETFILTER_ADVANCED=n + +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + select NF_REJECT_IPV4 + default NFT_REJECT + tristate + +config NF_TABLES_ARP + depends on NF_TABLES + tristate "ARP nf_tables support" + help + This option enables the ARP support for nf_tables. 
+ +config NF_NAT_IPV4 + tristate "IPv4 NAT" + depends on NF_CONNTRACK_IPV4 + default m if NETFILTER_ADVANCED=n + select NF_NAT + help + The IPv4 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. This can be + controlled by iptables or nft. + +if NF_NAT_IPV4 + config NFT_CHAIN_NAT_IPV4 depends on NF_TABLES_IPV4 - depends on NF_NAT_IPV4 && NFT_NAT tristate "IPv4 nf_tables nat chain support" help This option enables the "nat" chain for IPv4 in nf_tables. This @@ -61,16 +98,54 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. -config NFT_REJECT_IPV4 +config NF_NAT_MASQUERADE_IPV4 + tristate "IPv4 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection). + +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" depends on NF_TABLES_IPV4 - default NFT_REJECT + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + help + This is the expression that provides IPv4 masquerading support for + nf_tables. + +config NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support" + depends on NF_CONNTRACK_SNMP + depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +config NF_NAT_PROTO_GRE tristate + depends on NF_CT_PROTO_GRE -config NF_TABLES_ARP - depends on NF_TABLES - tristate "ARP nf_tables support" - help - This option enables the ARP support for nf_tables. 
+config NF_NAT_PPTP + tristate + depends on NF_CONNTRACK + default NF_CONNTRACK_PPTP + select NF_NAT_PROTO_GRE + +config NF_NAT_H323 + tristate + depends on NF_CONNTRACK + default NF_CONNTRACK_H323 + +endif # NF_NAT_IPV4 config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" @@ -138,6 +213,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" depends on IP_NF_FILTER + select NF_REJECT_IPV4 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMP @@ -159,42 +235,26 @@ config IP_NF_TARGET_SYNPROXY To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_ULOG - tristate "ULOG target support (obsolete)" - default m if NETFILTER_ADVANCED=n - ---help--- - - This option enables the old IPv4-only "ipt_ULOG" implementation - which has been obsoleted by the new "nfnetlink_log" code (see - CONFIG_NETFILTER_NETLINK_LOG). - - This option adds a `ULOG' target, which allows you to create rules in - any iptables table. The packet is passed to a userspace logging - daemon using netlink multicast sockets; unlike the LOG target - which can only be viewed through syslog. - - The appropriate userspace logging daemon (ulogd) may be obtained from - <http://www.netfilter.org/projects/ulogd/index.html> - - To compile it as a module, choose M here. If unsure, say N. - # NAT + specific targets: nf_conntrack -config NF_NAT_IPV4 - tristate "IPv4 NAT" +config IP_NF_NAT + tristate "iptables NAT support" depends on NF_CONNTRACK_IPV4 default m if NETFILTER_ADVANCED=n select NF_NAT + select NF_NAT_IPV4 + select NETFILTER_XT_NAT help - The IPv4 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in iptables: see the man page for iptables(8). + This enables the `nat' table in iptables. 
This allows masquerading, + port forwarding and other forms of full Network Address Port + Translation. To compile it as a module, choose M here. If unsure, say N. -if NF_NAT_IPV4 +if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV4 default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are @@ -223,47 +283,7 @@ config IP_NF_TARGET_REDIRECT (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_REDIRECT. -endif - -config NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support" - depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 - depends on NETFILTER_ADVANCED - default NF_NAT && NF_CONNTRACK_SNMP - ---help--- - - This module implements an Application Layer Gateway (ALG) for - SNMP payloads. In conjunction with NAT, it allows a network - management system to access multiple private networks with - conflicting addresses. It works by modifying IP addresses - inside SNMP payloads to match IP-layer NAT mapping. - - This is the "basic" form of SNMP-ALG, as described in RFC 2962 - - To compile it as a module, choose M here. If unsure, say N. - -# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), -# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. -# From kconfig-language.txt: -# -# <expr> '&&' <expr> (6) -# -# (6) Returns the result of min(/expr/, /expr/). 
- -config NF_NAT_PROTO_GRE - tristate - depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE - -config NF_NAT_PPTP - tristate - depends on NF_CONNTRACK && NF_NAT_IPV4 - default NF_NAT_IPV4 && NF_CONNTRACK_PPTP - select NF_NAT_PROTO_GRE - -config NF_NAT_H323 - tristate - depends on NF_CONNTRACK && NF_NAT_IPV4 - default NF_NAT_IPV4 && NF_CONNTRACK_H323 +endif # IP_NF_NAT # mangle + specific targets config IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 90b82405331e..f4cef5af0969 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,10 +19,18 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o +# logging +obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o +obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o + +# reject +obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o + # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o @@ -31,6 +39,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables @@ -39,7 +48,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o @@ -53,7 +62,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o -obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2510c02c2d21..e90f83a3415b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -285,7 +285,7 @@ clusterip_hashfn(const struct sk_buff *skb, } /* node numbers are 1..n, not 0..n */ - return (((u64)hashval * config->num_total_nodes) >> 32) + 1; + return reciprocal_scale(hashval, config->num_total_nodes) + 1; } static inline int diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 00352ce0f0de..da7f02a0b868 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -22,6 +22,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> +#include <net/netfilter/ipv4/nf_nat_masquerade.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -46,103 +47,17 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_conn *ct; - struct nf_conn_nat *nat; - enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range range; const struct nf_nat_ipv4_multi_range_compat *mr; - const struct rtable *rt; - __be32 newsrc, nh; - - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); - - ct = nf_ct_get(skb, &ctinfo); - nat = nfct_nat(ct); - - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - /* Source address is 0.0.0.0 - locally generated packet that is - * probably not supposed to be masqueraded. 
- */ - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) - return NF_ACCEPT; mr = par->targinfo; - rt = skb_rtable(skb); - nh = rt_nexthop(rt, ip_hdr(skb)->daddr); - newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE); - if (!newsrc) { - pr_info("%s ate my IP address\n", par->out->name); - return NF_DROP; - } - - nat->masq_index = par->out->ifindex; - - /* Transfer from original range. */ - memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); - memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); - newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.ip = newsrc; - newrange.max_addr.ip = newsrc; - newrange.min_proto = mr->range[0].min; - newrange.max_proto = mr->range[0].max; + range.flags = mr->range[0].flags; + range.min_proto = mr->range[0].min; + range.max_proto = mr->range[0].max; - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out); } -static int -device_cmp(struct nf_conn *i, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; - if (nf_ct_l3num(i) != NFPROTO_IPV4) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) { - /* Device was downed. Search entire table for - conntracks which were associated with that device, - and forget them. 
*/ - NF_CT_ASSERT(dev->ifindex != 0); - - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - } - - return NOTIFY_DONE; -} - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV4, @@ -160,12 +75,8 @@ static int __init masquerade_tg_init(void) ret = xt_register_target(&masquerade_tg_reg); - if (ret == 0) { - /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); - /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); - } + if (ret == 0) + nf_nat_masquerade_ipv4_register_notifier(); return ret; } @@ -173,8 +84,7 @@ static int __init masquerade_tg_init(void) static void __exit masquerade_tg_exit(void) { xt_unregister_target(&masquerade_tg_reg); - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); + nf_nat_masquerade_ipv4_unregister_notifier(); } module_init(masquerade_tg_init); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 5b6e0df4ccff..8f48f5517e33 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -20,7 +20,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include <linux/netfilter_bridge.h> #endif diff --git a/net/ipv4/netfilter/ipt_ULOG.c 
b/net/ipv4/netfilter/ipt_ULOG.c deleted file mode 100644 index 9cb993cd224b..000000000000 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ /dev/null @@ -1,498 +0,0 @@ -/* - * netfilter module for userspace packet logging daemons - * - * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2005-2007 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This module accepts two parameters: - * - * nlbufsiz: - * The parameter specifies how big the buffer for each netlink multicast - * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will - * get accumulated in the kernel until they are sent to userspace. It is - * NOT possible to allocate more than 128kB, and it is strongly discouraged, - * because atomically allocating 128kB inside the network rx softirq is not - * reliable. Please also keep in mind that this buffer size is allocated for - * each nlgroup you are using, so the total kernel memory usage increases - * by that factor. - * - * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since - * nlbufsiz is used with alloc_skb, which adds another - * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead. - * - * flushtimeout: - * Specify, after how many hundredths of a second the queue should be - * flushed even if it is not full yet. 
- */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/socket.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <net/netlink.h> -#include <linux/netdevice.h> -#include <linux/mm.h> -#include <linux/moduleparam.h> -#include <linux/netfilter.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_ULOG.h> -#include <net/netfilter/nf_log.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include <linux/bitops.h> -#include <asm/unaligned.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); - -#define ULOG_NL_EVENT 111 /* Harald's favorite number */ -#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ - -static unsigned int nlbufsiz = NLMSG_GOODSIZE; -module_param(nlbufsiz, uint, 0400); -MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); - -static unsigned int flushtimeout = 10; -module_param(flushtimeout, uint, 0600); -MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); - -static bool nflog = true; -module_param(nflog, bool, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - -/* global data structures */ - -typedef struct { - unsigned int qlen; /* number of nlmsgs' in the skb */ - struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ - struct sk_buff *skb; /* the pre-allocated skb */ - struct timer_list timer; /* the timer function */ -} ulog_buff_t; - -static int ulog_net_id __read_mostly; -struct ulog_net { - unsigned int nlgroup[ULOG_MAXNLGROUPS]; - ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; - struct sock *nflognl; - spinlock_t lock; -}; - -static struct ulog_net *ulog_pernet(struct net *net) -{ - return net_generic(net, ulog_net_id); -} - -/* send one ulog_buff_t to userspace 
*/ -static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum) -{ - ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; - - pr_debug("ulog_send: timer is deleting\n"); - del_timer(&ub->timer); - - if (!ub->skb) { - pr_debug("ulog_send: nothing to send\n"); - return; - } - - /* last nlmsg needs NLMSG_DONE */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_type = NLMSG_DONE; - - NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; - pr_debug("throwing %d packets to netlink group %u\n", - ub->qlen, nlgroupnum + 1); - netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, - GFP_ATOMIC); - - ub->qlen = 0; - ub->skb = NULL; - ub->lastnlh = NULL; -} - - -/* timer function to flush queue in flushtimeout time */ -static void ulog_timer(unsigned long data) -{ - unsigned int groupnum = *((unsigned int *)data); - struct ulog_net *ulog = container_of((void *)data, - struct ulog_net, - nlgroup[groupnum]); - pr_debug("timer function called, calling ulog_send\n"); - - /* lock to protect against somebody modifying our structure - * from ipt_ulog_target at the same time */ - spin_lock_bh(&ulog->lock); - ulog_send(ulog, groupnum); - spin_unlock_bh(&ulog->lock); -} - -static struct sk_buff *ulog_alloc_skb(unsigned int size) -{ - struct sk_buff *skb; - unsigned int n; - - /* alloc skb which should be big enough for a whole - * multipart message. 
WARNING: has to be <= 131000 - * due to slab allocator restrictions */ - - n = max(size, nlbufsiz); - skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); - if (!skb) { - if (n > size) { - /* try to allocate only as much as we need for - * current packet */ - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - pr_debug("cannot even allocate %ub\n", size); - } - } - - return skb; -} - -static void ipt_ulog_packet(struct net *net, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct ipt_ulog_info *loginfo, - const char *prefix) -{ - ulog_buff_t *ub; - ulog_packet_msg_t *pm; - size_t size, copy_len; - struct nlmsghdr *nlh; - struct timeval tv; - struct ulog_net *ulog = ulog_pernet(net); - - /* ffs == find first bit set, necessary because userspace - * is already shifting groupnumber, but we need unshifted. - * ffs() returns [1..32], we need [0..31] */ - unsigned int groupnum = ffs(loginfo->nl_group) - 1; - - /* calculate the size of the skb needed */ - if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len) - copy_len = skb->len; - else - copy_len = loginfo->copy_range; - - size = nlmsg_total_size(sizeof(*pm) + copy_len); - - ub = &ulog->ulog_buffers[groupnum]; - - spin_lock_bh(&ulog->lock); - - if (!ub->skb) { - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } else if (ub->qlen >= loginfo->qthreshold || - size > skb_tailroom(ub->skb)) { - /* either the queue len is too high or we don't have - * enough room in nlskb left. send it to userspace. 
*/ - - ulog_send(ulog, groupnum); - - if (!(ub->skb = ulog_alloc_skb(size))) - goto alloc_failure; - } - - pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); - - nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, - sizeof(*pm)+copy_len, 0); - if (!nlh) { - pr_debug("error during nlmsg_put\n"); - goto out_unlock; - } - ub->qlen++; - - pm = nlmsg_data(nlh); - memset(pm, 0, sizeof(*pm)); - - /* We might not have a timestamp, get one */ - if (skb->tstamp.tv64 == 0) - __net_timestamp((struct sk_buff *)skb); - - /* copy hook, prefix, timestamp, payload, etc. */ - pm->data_len = copy_len; - tv = ktime_to_timeval(skb->tstamp); - put_unaligned(tv.tv_sec, &pm->timestamp_sec); - put_unaligned(tv.tv_usec, &pm->timestamp_usec); - put_unaligned(skb->mark, &pm->mark); - pm->hook = hooknum; - if (prefix != NULL) { - strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); - pm->prefix[sizeof(pm->prefix) - 1] = '\0'; - } - else if (loginfo->prefix[0] != '\0') - strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); - - if (in && in->hard_header_len > 0 && - skb->mac_header != skb->network_header && - in->hard_header_len <= ULOG_MAC_LEN) { - memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); - pm->mac_len = in->hard_header_len; - } else - pm->mac_len = 0; - - if (in) - strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); - - if (out) - strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); - - /* copy_len <= skb->len, so can't fail. 
*/ - if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) - BUG(); - - /* check if we are building multi-part messages */ - if (ub->qlen > 1) - ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; - - ub->lastnlh = nlh; - - /* if timer isn't already running, start it */ - if (!timer_pending(&ub->timer)) { - ub->timer.expires = jiffies + flushtimeout * HZ / 100; - add_timer(&ub->timer); - } - - /* if threshold is reached, send message to userspace */ - if (ub->qlen >= loginfo->qthreshold) { - if (loginfo->qthreshold > 1) - nlh->nlmsg_type = NLMSG_DONE; - ulog_send(ulog, groupnum); - } -out_unlock: - spin_unlock_bh(&ulog->lock); - - return; - -alloc_failure: - pr_debug("Error building netlink message\n"); - spin_unlock_bh(&ulog->lock); -} - -static unsigned int -ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - - ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, - par->targinfo, NULL); - return XT_CONTINUE; -} - -static void ipt_logfn(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li, - const char *prefix) -{ - struct ipt_ulog_info loginfo; - - if (!li || li->type != NF_LOG_TYPE_ULOG) { - loginfo.nl_group = ULOG_DEFAULT_NLGROUP; - loginfo.copy_range = 0; - loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; - loginfo.prefix[0] = '\0'; - } else { - loginfo.nl_group = li->u.ulog.group; - loginfo.copy_range = li->u.ulog.copy_len; - loginfo.qthreshold = li->u.ulog.qthreshold; - strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); - } - - ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); -} - -static int ulog_tg_check(const struct xt_tgchk_param *par) -{ - const struct ipt_ulog_info *loginfo = par->targinfo; - - if (!par->net->xt.ulog_warn_deprecated) { - pr_info("ULOG is deprecated and it will be removed soon, " - "use NFLOG instead\n"); - 
par->net->xt.ulog_warn_deprecated = true; - } - - if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { - pr_debug("prefix not null-terminated\n"); - return -EINVAL; - } - if (loginfo->qthreshold > ULOG_MAX_QLEN) { - pr_debug("queue threshold %Zu > MAX_QLEN\n", - loginfo->qthreshold); - return -EINVAL; - } - return 0; -} - -#ifdef CONFIG_COMPAT -struct compat_ipt_ulog_info { - compat_uint_t nl_group; - compat_size_t copy_range; - compat_size_t qthreshold; - char prefix[ULOG_PREFIX_LEN]; -}; - -static void ulog_tg_compat_from_user(void *dst, const void *src) -{ - const struct compat_ipt_ulog_info *cl = src; - struct ipt_ulog_info l = { - .nl_group = cl->nl_group, - .copy_range = cl->copy_range, - .qthreshold = cl->qthreshold, - }; - - memcpy(l.prefix, cl->prefix, sizeof(l.prefix)); - memcpy(dst, &l, sizeof(l)); -} - -static int ulog_tg_compat_to_user(void __user *dst, const void *src) -{ - const struct ipt_ulog_info *l = src; - struct compat_ipt_ulog_info cl = { - .nl_group = l->nl_group, - .copy_range = l->copy_range, - .qthreshold = l->qthreshold, - }; - - memcpy(cl.prefix, l->prefix, sizeof(cl.prefix)); - return copy_to_user(dst, &cl, sizeof(cl)) ? 
-EFAULT : 0; -} -#endif /* CONFIG_COMPAT */ - -static struct xt_target ulog_tg_reg __read_mostly = { - .name = "ULOG", - .family = NFPROTO_IPV4, - .target = ulog_tg, - .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ulog_tg_check, -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = ulog_tg_compat_from_user, - .compat_to_user = ulog_tg_compat_to_user, -#endif - .me = THIS_MODULE, -}; - -static struct nf_logger ipt_ulog_logger __read_mostly = { - .name = "ipt_ULOG", - .logfn = ipt_logfn, - .me = THIS_MODULE, -}; - -static int __net_init ulog_tg_net_init(struct net *net) -{ - int i; - struct ulog_net *ulog = ulog_pernet(net); - struct netlink_kernel_cfg cfg = { - .groups = ULOG_MAXNLGROUPS, - }; - - spin_lock_init(&ulog->lock); - /* initialize ulog_buffers */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ulog->nlgroup[i] = i; - setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, - (unsigned long)&ulog->nlgroup[i]); - } - - ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); - if (!ulog->nflognl) - return -ENOMEM; - - if (nflog) - nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; -} - -static void __net_exit ulog_tg_net_exit(struct net *net) -{ - ulog_buff_t *ub; - int i; - struct ulog_net *ulog = ulog_pernet(net); - - if (nflog) - nf_log_unset(net, &ipt_ulog_logger); - - netlink_kernel_release(ulog->nflognl); - - /* remove pending timers and free allocated skb's */ - for (i = 0; i < ULOG_MAXNLGROUPS; i++) { - ub = &ulog->ulog_buffers[i]; - pr_debug("timer is deleting\n"); - del_timer(&ub->timer); - - if (ub->skb) { - kfree_skb(ub->skb); - ub->skb = NULL; - } - } -} - -static struct pernet_operations ulog_tg_net_ops = { - .init = ulog_tg_net_init, - .exit = ulog_tg_net_exit, - .id = &ulog_net_id, - .size = sizeof(struct ulog_net), -}; - -static int __init ulog_tg_init(void) -{ - int ret; - pr_debug("init module\n"); - - if (nlbufsiz > 128*1024) { - pr_warn("Netlink buffer has to be <= 
128kB\n"); - return -EINVAL; - } - - ret = register_pernet_subsys(&ulog_tg_net_ops); - if (ret) - goto out_pernet; - - ret = xt_register_target(&ulog_tg_reg); - if (ret < 0) - goto out_target; - - if (nflog) - nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); - - return 0; - -out_target: - unregister_pernet_subsys(&ulog_tg_net_ops); -out_pernet: - return ret; -} - -static void __exit ulog_tg_exit(void) -{ - pr_debug("cleanup_module\n"); - if (nflog) - nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ulog_tg_reg); - unregister_pernet_subsys(&ulog_tg_net_ops); -} - -module_init(ulog_tg_init); -module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index f1787c04a4dd..6b67d7e9a75d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -28,222 +28,57 @@ static const struct xt_table nf_nat_ipv4_table = { .af = NFPROTO_IPV4, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? 
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table); } -static unsigned int -nf_nat_ipv4_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - /* maniptype == SRC for postrouting. */ - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - - /* We never see fragments: conntrack defrags on pre-routing - * and local-out, and nf_nat_out protects post-routing. - */ - NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 
8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - __be32 daddr = ip_hdr(skb)->daddr; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - daddr != ip_hdr(skb)->daddr) - skb_dst_drop(skb); - - return ret; + return 
nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if ((ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain); } -static unsigned int -nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. 
*/ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - err = ip_route_me_harder(skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_in, + .hook = iptable_nat_ipv4_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, @@ -251,7 +86,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_out, + .hook = iptable_nat_ipv4_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, @@ -259,7 +94,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv4_local_fn, + .hook = iptable_nat_ipv4_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, @@ -267,7 +102,7 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv4_fn, + .hook = iptable_nat_ipv4_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 
b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8127dc802865..a054fe083431 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, .get_optmax = SO_ORIGINAL_DST+1, - .get = &getorigdst, + .get = getorigdst, .owner = THIS_MODULE, }; @@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, .get_l4proto = ipv4_get_l4proto, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index a338dad41b7d..b91b2641adda 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .error = icmp_error, .destroy = NULL, .me = NULL, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr 
= icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index b8f6381c7d0b..7e5ca6f2d0cd 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -17,7 +17,7 @@ #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/netfilter/nf_conntrack_zones.h> @@ -45,12 +45,12 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone = NF_CT_DEFAULT_ZONE; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; @@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, inet->nodefrag) return NF_ACCEPT; -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. 
*/ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c new file mode 100644 index 000000000000..ccfc78db12ee --- /dev/null +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -0,0 +1,149 @@ +/* + * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * Based on code from ebt_log from: + * + * Bart De Schuymer <bdschuym@pandora.be> + * Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +struct arppayload { + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void dump_arp_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int nhoff) +{ + const struct arphdr *ah; + struct arphdr _arph; + const struct arppayload *ap; + struct arppayload _arpp; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); + + /* If it's for Ethernet and the lengths are OK, then log the ARP + * payload. 
+ */ + if (ah->ar_hrd != htons(1) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != sizeof(__be32)) + return; + + ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); + if (ap == NULL) { + nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]", + skb->len - sizeof(_arph)); + return; + } + nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); +} + +void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix); + dump_arp_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_arp_logger __read_mostly = { + .name = "nf_log_arp", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_arp_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_arp_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __net_exit nf_log_arp_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_arp_logger); +} + +static struct pernet_operations nf_log_arp_net_ops = { + .init = nf_log_arp_net_init, + .exit = nf_log_arp_net_exit, +}; + +static int __init nf_log_arp_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_arp_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_ARP, &nf_arp_logger); + return 0; +} + +static void __exit nf_log_arp_exit(void) +{ + unregister_pernet_subsys(&nf_log_arp_net_ops); + nf_log_unregister(&nf_arp_logger); +} + +module_init(nf_log_arp_init); +module_exit(nf_log_arp_exit); + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 
+MODULE_DESCRIPTION("Netfilter ARP packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(3, 0); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c new file mode 100644 index 000000000000..078bdca1b607 --- /dev/null +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -0,0 +1,385 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + nf_log_buf_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + nf_log_buf_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + nf_log_buf_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + nf_log_buf_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + 
[ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + nf_log_buf_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + nf_log_buf_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. 
*/ + nf_log_buf_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) { + nf_log_buf_add(m, "MTU=%u ", + ntohs(ich->un.frag.mtu)); + } + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + nf_log_buf_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + nf_log_buf_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 
18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + nf_log_buf_add(m, ":%02x", *p); + } + nf_log_buf_add(m, " "); +} + +static void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, + out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip_logger __read_mostly = { + .name = "nf_log_ipv4", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip_packet, + .me = THIS_MODULE, +}; + +static int 
__net_init nf_log_ipv4_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __net_exit nf_log_ipv4_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip_logger); +} + +static struct pernet_operations nf_log_ipv4_net_ops = { + .init = nf_log_ipv4_net_init, + .exit = nf_log_ipv4_net_exit, +}; + +static int __init nf_log_ipv4_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv4_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + return 0; +} + +static void __exit nf_log_ipv4_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + nf_log_unregister(&nf_ip_logger); +} + +module_init(nf_log_ipv4_init); +module_exit(nf_log_ipv4_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET, 0); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index d8b2e14efddc..fc37711e11f3 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, @@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv4_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv4_decode_session, #endif @@ -250,6 +254,205 @@ int 
nf_nat_icmp_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); +unsigned int +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + /* maniptype == SRC for postrouting. */ + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + * and local-out, and nf_nat_out protects post-routing. + */ + NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + ops->hooknum)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. 
+ */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); + +unsigned int +nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + __be32 daddr = ip_hdr(skb)->daddr; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + daddr != ip_hdr(skb)->daddr) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_in); + +unsigned int +nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_out); + +unsigned int +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (ct->tuplehash[dir].tuple.dst.u3.ip != + ct->tuplehash[!dir].tuple.src.u3.ip) { + err = ip_route_me_harder(skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn); + static int __init nf_nat_l3proto_ipv4_init(void) { int err; diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c new file mode 100644 index 000000000000..c6eb42100e9a --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -0,0 +1,153 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/atomic.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <net/route.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/ipv4/nf_nat_masquerade.h> + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range *range, + const struct net_device *out) +{ + struct nf_conn *ct; + struct nf_conn_nat *nat; + enum ip_conntrack_info ctinfo; + struct nf_nat_range newrange; + const struct rtable *rt; + __be32 newsrc, nh; + + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + + ct = nf_ct_get(skb, &ctinfo); + nat = nfct_nat(ct); + + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + /* Source address is 0.0.0.0 - locally generated packet that is + * probably not supposed to be masqueraded. + */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) + return NF_ACCEPT; + + rt = skb_rtable(skb); + nh = rt_nexthop(rt, ip_hdr(skb)->daddr); + newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); + if (!newsrc) { + pr_info("%s ate my IP address\n", out->name); + return NF_DROP; + } + + nat->masq_index = out->ifindex; + + /* Transfer from original range. */ + memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); + memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.ip = newsrc; + newrange.max_addr.ip = newsrc; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + /* Hand modified range to generic setup. 
*/ + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); + +static int device_cmp(struct nf_conn *i, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(i); + + if (!nat) + return 0; + if (nf_ct_l3num(i) != NFPROTO_IPV4) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + * conntracks which were associated with that device, + * and forget them. + */ + NF_CT_ASSERT(dev->ifindex != 0); + + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv4_register_notifier(void) +{ + /* check if the notifier was already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); + +void nf_nat_masquerade_ipv4_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 
0) + return; + + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 690d890111bb..9414923f1e15 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = { .manip_pkt = gre_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = gre_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index eb303471bcf6..4557b4ab8342 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c new file mode 100644 index 000000000000..b023b4eb1a96 --- /dev/null +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -0,0 +1,127 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <net/ip.h> +#include <net/tcp.h> +#include <net/route.h> +#include <net/dst.h> +#include <linux/netfilter_ipv4.h> + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _otcph, *tcph; + + /* IP header checks: fragment. */ + if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + /* Check checksum */ + if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) + return; + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; + + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) + tcph->seq = oth->ack_seq; + else { + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - ip_hdrlen(oldskb) - + (oth->doff << 2)); + tcph->ack = 1; + } + + tcph->rst = 1; + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + 
+ /* ip_route_me_harder expects skb->dst to be set */ + skb_dst_set_noref(nskb, skb_dst(oldskb)); + + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); + + /* "Never happens" */ + if (nskb->len > dst_mtu(skb_dst(nskb))) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + niph->tot_len = htons(nskb->len); + ip_send_check(niph); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) + goto free_nskb; + dev_queue_xmit(nskb); + } else +#endif + ip_local_out(nskb); + + return; + + free_nskb: + kfree_skb(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 3964157d826c..df547bf50078 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -26,136 +26,53 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ip.h> -/* - * NAT chains - */ - -static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_nat *nat; - enum 
nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); struct nft_pktinfo pkt; - unsigned int ret; - - if (ct == NULL || nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED + IP_CT_IS_REPLY: - if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - ops->hooknum)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall through */ - case IP_CT_NEW: - if (nf_nat_initialized(ct, maniptype)) - break; - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - ret = nft_do_chain(&pkt, ops); - if (ret != NF_ACCEPT) - return ret; - if (!nf_nat_initialized(ct, maniptype)) { - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); - if (ret != NF_ACCEPT) - return ret; - } - default: - break; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nft_do_chain(&pkt, ops); } -static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - __be32 daddr = ip_hdr(skb)->daddr; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ip_hdr(skb)->daddr != daddr) { - skb_dst_drop(skb); - } - return ret; + return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops, + 
struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo __maybe_unused; - const struct nf_conn *ct __maybe_unused; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip || - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all) - return nf_xfrm_me_harder(skb, AF_INET) == 0 ? - ret : NF_DROP; - } -#endif - return ret; + return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; - unsigned int ret; - - ret = nf_nat_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain); +} - if (ct->tuplehash[dir].tuple.dst.u3.ip != - ct->tuplehash[!dir].tuple.src.u3.ip) { - if (ip_route_me_harder(skb, RTN_UNSPEC)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET)) - ret = NF_DROP; -#endif - } - return ret; +static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + 
return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv4 = { @@ -168,10 +85,10 @@ static const struct nf_chain_type nft_chain_nat_ipv4 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nf_nat_prerouting, - [NF_INET_POST_ROUTING] = nf_nat_postrouting, - [NF_INET_LOCAL_OUT] = nf_nat_output, - [NF_INET_LOCAL_IN] = nf_nat_fn, + [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in, + [NF_INET_POST_ROUTING] = nft_nat_ipv4_out, + [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, + [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, }, }; diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c new file mode 100644 index 000000000000..1c636d6b5b50 --- /dev/null +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_masq.h> +#include <net/netfilter/ipv4/nf_nat_masquerade.h> + +static void nft_masq_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_masq_ipv4_type; +static const struct nft_expr_ops nft_masq_ipv4_ops = { + .type = &nft_masq_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv4_eval, + .init = nft_masq_init, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "masq", + .ops = &nft_masq_ipv4_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv4_module_init(void) +{ + int ret; + + ret = nft_register_expr(&nft_masq_ipv4_type); + if (ret < 0) + return ret; + + nf_nat_masquerade_ipv4_register_notifier(); + + return ret; +} + +static void __exit nft_masq_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv4_type); + nf_nat_masquerade_ipv4_unregister_notifier(); +} + +module_init(nft_masq_ipv4_module_init); +module_exit(nft_masq_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e79718a382f2..ed33299c56d1 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ 
b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -16,7 +16,6 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -#include <net/icmp.h> #include <net/netfilter/ipv4/nf_reject.h> #include <net/netfilter/nft_reject.h> diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 044a0ddf6a79..57f7c9804139 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((sysctl_ip_nonlocal_bind == 0 && + if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && isk->freebind == 0 && isk->transparent == 0 && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || @@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); *addr_len = sizeof(*sin6); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ae0af9386f7c..8e3eb39f84e7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -52,6 +52,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem; int orphans, sockets; local_bh_disable(); @@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(net), ip_frag_mem(net)); + frag_mem = ip_frag_mem(net); + seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 46d6a1c923a8..4b7c0ec65251 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -30,6 +30,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload 
__rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_offloads); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c65160565e1..739db3100c23 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -58,6 +58,7 @@ #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> +#include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> @@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) while (sk) { delivered = 1; - if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && + ip_mc_sf_allow(sk, iph->daddr, iph->saddr, + skb->dev->ifindex)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ @@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_fromiovecend((void *)iph, from, 0, length)) @@ -606,6 +611,8 @@ back_from_confirm: &rt, msg->msg_flags); else { + sock_tx_timestamp(sk, &ipc.tx_flags); + if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 190199851c9a..793c0bb8c4fd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -596,12 +596,12 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) static inline u32 fnhe_hashfun(__be32 daddr) { + static u32 fnhe_hashrnd __read_mostly; u32 hval; - hval = (__force u32) daddr; - hval ^= (hval >> 11) ^ (hval >> 22); - - return hval & (FNHE_HASH_SIZE - 1); + net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); + hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + return hash_32(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, 
struct fib_nh_exception *fnhe) @@ -628,12 +628,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, spin_lock_bh(&fnhe_lock); - hash = nh->nh_exceptions; + hash = rcu_dereference(nh->nh_exceptions); if (!hash) { hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - nh->nh_exceptions = hash; + rcu_assign_pointer(nh->nh_exceptions, hash); } hash += hval; @@ -746,7 +746,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow } n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); - if (n) { + if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); } else { @@ -1242,7 +1242,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) { - struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); struct fib_nh_exception *fnhe; u32 hval; @@ -1798,8 +1798,6 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; - if (err == -ESRCH) - err = -ENETUNREACH; goto local_input; /* @@ -2267,9 +2265,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, return rt; if (flp4->flowi4_proto) - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); return rt; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c86624b36a62..0431a8f3c8f4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,7 +25,7 @@ extern int sysctl_tcp_syncookies; -static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr 
*iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, + __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..b3c53c8b331e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_nonlocal_bind", - .data = &sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, .maxlen = sizeof(int), @@ -450,6 +443,16 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -628,15 +631,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, @@ -728,6 +722,22 @@ static struct ctl_table ipv4_table[] = { .extra2 = &one, }, { + .procname = "icmp_msgs_per_sec", + .data = &sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "icmp_msgs_burst", + .data = &sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
proc_dointvec_minmax, + .extra1 = &zero, + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), @@ -839,6 +849,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d2118e5fbc7..86023b9be47f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/netdma.h> #include <net/sock.h> #include <asm/uaccess.h> @@ -405,7 +404,7 @@ void tcp_init_sock(struct sock *sk) tp->reordering = sysctl_tcp_reordering; tcp_enable_early_retrans(tp); - icsk->icsk_ca_ops = &tcp_init_congestion_ops; + tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -426,6 +425,17 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +{ + if (sk->sk_tsflags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + sock_tx_timestamp(sk, &shinfo->tx_flags); + if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + } +} + /* * Wait for a TCP event. 
* @@ -523,7 +533,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err) + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; return mask; @@ -598,7 +608,7 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct sk_buff *skb) +static void skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -607,7 +617,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; tcb->sacked = 0; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -952,15 +962,17 @@ new_segment: skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; copied += copy; offset += copy; - if (!(size -= copy)) + if (!(size -= copy)) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -1175,13 +1187,6 @@ new_segment: goto wait_for_memory; /* - * All packets are restored as if they have - * already been sent. - */ - if (tp->repair) - TCP_SKB_CB(skb)->when = tcp_time_stamp; - - /* * Check whether we can use HW checksum. */ if (sk->sk_route_caps & NETIF_F_ALL_CSUM) @@ -1190,6 +1195,13 @@ new_segment: skb_entail(sk, skb); copy = size_goal; max = size_goal; + + /* All packets are restored as if they have + * already been sent. skb_mstamp isn't set to + * avoid wrong rtt estimation. 
+ */ + if (tp->repair) + TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ @@ -1248,12 +1260,14 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->gso_segs = 0; + tcp_skb_pcount_set(skb, 0); from += copy; copied += copy; - if ((seglen -= copy) == 0 && iovlen == 0) + if ((seglen -= copy) == 0 && iovlen == 0) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; @@ -1379,7 +1393,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1455,39 +1469,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1495,9 +1476,9 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - 
TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; - if (offset < skb->len || tcp_hdr(skb)->fin) { + if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } @@ -1505,7 +1486,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1570,12 +1551,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (offset + 1 != skb->len) continue; } - if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1613,10 +1594,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; + if (unlikely(flags & MSG_ERRQUEUE)) + return ip_recv_error(sk, msg, len, addr_len); + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); @@ -1656,28 +1639,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } 
-#endif - do { u32 offset; @@ -1704,11 +1665,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (tcp_hdr(skb)->syn) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) offset--; if (offset < skb->len) goto found_ok_skb; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", @@ -1808,16 +1769,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) { - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1825,11 +1776,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1887,43 +1833,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! 
*/ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1941,21 +1857,17 @@ skip_copy: if (used + offset < skb->len) continue; - if (tcp_hdr(skb)->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1978,16 +1890,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2142,8 +2044,10 @@ void tcp_close(struct sock *sk, long timeout) * reader process may not have drained the data yet! 
*/ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; data_was_unread += len; __kfree_skb(skb); } @@ -2331,9 +2235,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; @@ -2673,7 +2574,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; #endif case TCP_USER_TIMEOUT: - /* Cap the max timeout in ms TCP will retry/retrans + /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) @@ -3152,7 +3053,7 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); -static void tcp_init_mem(void) +static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); @@ -3170,8 +3071,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -3238,8 +3139,6 @@ void __init tcp_init(void) tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_metrics_init(); - - tcp_register_congestion_control(&tcp_reno); - + BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index d5de69bc04f5..bb395d46a389 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ 
-17,7 +17,6 @@ #include <linux/module.h> #include <net/tcp.h> - #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ @@ -46,11 +45,10 @@ MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); - /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ - u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ @@ -103,7 +101,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* binary increase */ if (cwnd < ca->last_max_cwnd) { - __u32 dist = (ca->last_max_cwnd - cwnd) + __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B; if (dist > max_increment) @@ -154,7 +152,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); } - } /* @@ -177,7 +174,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->loss_cwnd = tp->snd_cwnd; - if (tp->snd_cwnd <= low_window) return max(tp->snd_cwnd >> 1U, 2U); else @@ -188,6 +184,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct bictcp *ca = inet_csk_ca(sk); + return max(tp->snd_cwnd, ca->loss_cwnd); } @@ -206,12 +203,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } } - static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 7b09d8b49fa5..b1c5970d47a1 100644 --- a/net/ipv4/tcp_cong.c +++ 
b/net/ipv4/tcp_cong.c @@ -74,24 +74,34 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct sock *sk) +void tcp_assign_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - /* if no choice made yet assign the current value set as default */ - if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - icsk->icsk_ca_ops = ca; - break; - } - - /* fallback to next available */ + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (likely(try_module_get(ca->owner))) { + icsk->icsk_ca_ops = ca; + goto out; } - rcu_read_unlock(); + /* Fallback to next available. The last really + * guaranteed fallback is Reno from this list. + */ } +out: + rcu_read_unlock(); + + /* Clear out private data before diag gets it and + * the ca has not been initialized. + */ + if (ca->get_info) + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); +} + +void tcp_init_congestion_control(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); @@ -142,7 +152,6 @@ static int __init tcp_congestion_default(void) } late_initcall(tcp_congestion_default); - /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { @@ -154,7 +163,6 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); - } rcu_read_unlock(); } @@ -186,7 +194,6 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? 
"" : " ", ca->name); - } rcu_read_unlock(); } @@ -230,7 +237,6 @@ out: return ret; } - /* Change congestion control for socket */ int tcp_set_congestion_control(struct sock *sk, const char *name) { @@ -285,15 +291,13 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -int tcp_slow_start(struct tcp_sock *tp, u32 acked) +void tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; - acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -337,6 +341,7 @@ EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); @@ -348,15 +353,3 @@ struct tcp_congestion_ops tcp_reno = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, }; - -/* Initial congestion control used (until SYN) - * really reno under another name so we can tell difference - * during tcp_set_default_congestion_control - */ -struct tcp_congestion_ops tcp_init_congestion_ops = { - .name = "", - .owner = THIS_MODULE, - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, -}; -EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9bd8a4828a9..20de0118c98e 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -82,12 +82,13 @@ MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (mse /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ - u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last 
snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ - u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 bic_K; /* time to origin point + from the beginning of the current epoch */ u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ @@ -219,7 +220,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->last_time = tcp_time_stamp; if (ca->epoch_start == 0) { - ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; /* record beginning */ ca->ack_cnt = 1; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ @@ -263,9 +264,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* c/rtt * (t-K)^3 */ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); - if (t < ca->bic_K) /* below origin*/ + if (t < ca->bic_K) /* below origin*/ bic_target = ca->bic_origin_point - delta; - else /* above origin*/ + else /* above origin*/ bic_target = ca->bic_origin_point + delta; /* cubic function - calc bictcp_cnt*/ @@ -285,13 +286,14 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; while (ca->ack_cnt > delta) { /* update tcp cwnd */ ca->ack_cnt -= delta; ca->tcp_cwnd++; } - if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ delta = ca->tcp_cwnd - cwnd; max_cnt = cwnd / delta; if (ca->cnt > max_cnt) @@ -320,7 +322,6 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) bictcp_update(ca, tp->snd_cwnd); tcp_cong_avoid_ai(tp, ca->cnt); } - } static u32 bictcp_recalc_ssthresh(struct sock *sk) @@ -452,7 +453,8 @@ static int __init cubictcp_register(void) * based on SRTT of 100ms */ - beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / 
(BICTCP_BETA_SCALE - beta); + beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 + / (BICTCP_BETA_SCALE - beta); cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c @@ -0,0 +1,344 @@ +/* DataCenter TCP (DCTCP) congestion control. + * + * http://simula.stanford.edu/~alizade/Site/DCTCP.html + * + * This is an implementation of DCTCP over Reno, an enhancement to the + * TCP congestion control algorithm designed for data centers. DCTCP + * leverages Explicit Congestion Notification (ECN) in the network to + * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet + * the following three data center transport requirements: + * + * - High burst tolerance (incast due to partition/aggregate) + * - Low latency (short flows, queries) + * - High throughput (continuous data updates, large file transfers) + * with commodity shallow buffered switches + * + * The algorithm is described in detail in the following two papers: + * + * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + * "Data Center TCP (DCTCP)", Data Center Networks session + * Proc. ACM SIGCOMM, New Delhi, 2010. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + * + * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + * "Analysis of DCTCP: Stability, Convergence, and Fairness" + * Proc. ACM SIGMETRICS, San Jose, 2011. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + * + * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. 
+ * + * Authors: + * + * Daniel Borkmann <dborkman@redhat.com> + * Florian Westphal <fw@strlen.de> + * Glenn Judd <glenn.judd@morganstanley.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <net/tcp.h> +#include <linux/inet_diag.h> + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + u32 acked_bytes_ecn; + u32 acked_bytes_total; + u32 prior_snd_una; + u32 prior_rcv_nxt; + u32 dctcp_alpha; + u32 next_seq; + u32 ce_state; + u32 delayed_ack_reserved; +}; + +static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ +module_param(dctcp_shift_g, uint, 0644); +MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); + +static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; +module_param(dctcp_alpha_on_init, uint, 0644); +MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); + +static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; +module_param(dctcp_clamp_alpha_on_loss, uint, 0644); +MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, + "parameter for clamping alpha on loss"); + +static struct tcp_congestion_ops dctcp_reno; + +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->acked_bytes_ecn = 0; + ca->acked_bytes_total = 0; +} + +static void dctcp_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->ecn_flags & TCP_ECN_OK) || + (sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE)) { + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_snd_una = tp->snd_una; + ca->prior_rcv_nxt = tp->rcv_nxt; + + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + + ca->delayed_ack_reserved = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); + 
return; + } + + /* No ECN support? Fall back to Reno. Also need to clear + * ECT from sk since it is set during 3WHS for DCTCP. + */ + inet_csk(sk)->icsk_ca_ops = &dctcp_reno; + INET_ECN_dontxmit(sk); +} + +static u32 dctcp_ssthresh(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ + +static void dctcp_ce_state_0_to_1(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ + if (!ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=0. */ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; + + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +} + +static void dctcp_ce_state_1_to_0(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ + if (ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=1. */ + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. 
*/ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; + + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static void dctcp_update_alpha(struct sock *sk, u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + u32 acked_bytes = tp->snd_una - ca->prior_snd_una; + + /* If ack did not advance snd_una, count dupack as MSS size. + * If ack did update window, do not count it at all. + */ + if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; + if (acked_bytes) { + ca->acked_bytes_total += acked_bytes; + ca->prior_snd_una = tp->snd_una; + + if (flags & CA_ACK_ECE) + ca->acked_bytes_ecn += acked_bytes; + } + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + /* For avoiding denominator == 1. */ + if (ca->acked_bytes_total == 0) + ca->acked_bytes_total = 1; + + /* alpha = (1 - g) * alpha + g * F */ + ca->dctcp_alpha = ca->dctcp_alpha - + (ca->dctcp_alpha >> dctcp_shift_g) + + (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / + ca->acked_bytes_total; + + if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) + /* Clamp dctcp_alpha to max. */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + + dctcp_reset(tp, ca); + } +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { + struct dctcp *ca = inet_csk_ca(sk); + + /* If this extension is enabled, we clamp dctcp_alpha to + * max on packet loss; the motivation is that dctcp_alpha + * is an indicator to the extend of congestion and packet + * loss is an indicator of extreme congestion; setting + * this in practice turned out to be beneficial, and + * effectively assumes total congestion which reduces the + * window by half. 
+ */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + } +} + +static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_DELAYED_ACK: + if (!ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 1; + break; + case CA_EVENT_NON_DELAYED_ACK: + if (ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 0; + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ + switch (ev) { + case CA_EVENT_ECN_IS_CE: + dctcp_ce_state_0_to_1(sk); + break; + case CA_EVENT_ECN_NO_CE: + dctcp_ce_state_1_to_0(sk); + break; + case CA_EVENT_DELAYED_ACK: + case CA_EVENT_NON_DELAYED_ACK: + dctcp_update_ack_reserved(sk, ev); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + /* Fill it also in case of VEGASINFO due to req struct limits. + * We can still correctly retrieve it later. 
+ */ + if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_dctcp_info info; + + memset(&info, 0, sizeof(info)); + if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { + info.dctcp_enabled = 1; + info.dctcp_ce_state = (u16) ca->ce_state; + info.dctcp_alpha = ca->dctcp_alpha; + info.dctcp_ab_ecn = ca->acked_bytes_ecn; + info.dctcp_ab_tot = ca->acked_bytes_total; + } + + nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops dctcp __read_mostly = { + .init = dctcp_init, + .in_ack_event = dctcp_update_alpha, + .cwnd_event = dctcp_cwnd_event, + .ssthresh = dctcp_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .set_state = dctcp_state, + .get_info = dctcp_get_info, + .flags = TCP_CONG_NEEDS_ECN, + .owner = THIS_MODULE, + .name = "dctcp", +}; + +static struct tcp_congestion_ops dctcp_reno __read_mostly = { + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .get_info = dctcp_get_info, + .owner = THIS_MODULE, + .name = "dctcp-reno", +}; + +static int __init dctcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&dctcp); +} + +static void __exit dctcp_unregister(void) +{ + tcp_unregister_congestion_control(&dctcp); +} + +module_init(dctcp_register); +module_exit(dctcp_unregister); + +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index ed3f2ad42e0f..0d73f9ddb55b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -9,7 +9,6 @@ * 2 of the License, or (at your option) any later version. 
*/ - #include <linux/module.h> #include <linux/inet_diag.h> @@ -35,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + struct inet_diag_req_v2 *r, struct nlattr *bc) { inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + struct inet_diag_req_v2 *req) { return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9771563ab564..815c85e3b1e0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -115,7 +115,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { struct in6_addr *buf = (struct in6_addr *) tmp.val; - int i = 4; + int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 1c4908280d92..882c08aae2f5 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <net/tcp.h> - /* From AIMD tables from RFC 3649 appendix B, * with fixed-point MD scaled <<8. 
*/ @@ -17,78 +16,78 @@ static const struct hstcp_aimd_val { unsigned int cwnd; unsigned int md; } hstcp_aimd_vals[] = { - { 38, 128, /* 0.50 */ }, - { 118, 112, /* 0.44 */ }, - { 221, 104, /* 0.41 */ }, - { 347, 98, /* 0.38 */ }, - { 495, 93, /* 0.37 */ }, - { 663, 89, /* 0.35 */ }, - { 851, 86, /* 0.34 */ }, - { 1058, 83, /* 0.33 */ }, - { 1284, 81, /* 0.32 */ }, - { 1529, 78, /* 0.31 */ }, - { 1793, 76, /* 0.30 */ }, - { 2076, 74, /* 0.29 */ }, - { 2378, 72, /* 0.28 */ }, - { 2699, 71, /* 0.28 */ }, - { 3039, 69, /* 0.27 */ }, - { 3399, 68, /* 0.27 */ }, - { 3778, 66, /* 0.26 */ }, - { 4177, 65, /* 0.26 */ }, - { 4596, 64, /* 0.25 */ }, - { 5036, 62, /* 0.25 */ }, - { 5497, 61, /* 0.24 */ }, - { 5979, 60, /* 0.24 */ }, - { 6483, 59, /* 0.23 */ }, - { 7009, 58, /* 0.23 */ }, - { 7558, 57, /* 0.22 */ }, - { 8130, 56, /* 0.22 */ }, - { 8726, 55, /* 0.22 */ }, - { 9346, 54, /* 0.21 */ }, - { 9991, 53, /* 0.21 */ }, - { 10661, 52, /* 0.21 */ }, - { 11358, 52, /* 0.20 */ }, - { 12082, 51, /* 0.20 */ }, - { 12834, 50, /* 0.20 */ }, - { 13614, 49, /* 0.19 */ }, - { 14424, 48, /* 0.19 */ }, - { 15265, 48, /* 0.19 */ }, - { 16137, 47, /* 0.19 */ }, - { 17042, 46, /* 0.18 */ }, - { 17981, 45, /* 0.18 */ }, - { 18955, 45, /* 0.18 */ }, - { 19965, 44, /* 0.17 */ }, - { 21013, 43, /* 0.17 */ }, - { 22101, 43, /* 0.17 */ }, - { 23230, 42, /* 0.17 */ }, - { 24402, 41, /* 0.16 */ }, - { 25618, 41, /* 0.16 */ }, - { 26881, 40, /* 0.16 */ }, - { 28193, 39, /* 0.16 */ }, - { 29557, 39, /* 0.15 */ }, - { 30975, 38, /* 0.15 */ }, - { 32450, 38, /* 0.15 */ }, - { 33986, 37, /* 0.15 */ }, - { 35586, 36, /* 0.14 */ }, - { 37253, 36, /* 0.14 */ }, - { 38992, 35, /* 0.14 */ }, - { 40808, 35, /* 0.14 */ }, - { 42707, 34, /* 0.13 */ }, - { 44694, 33, /* 0.13 */ }, - { 46776, 33, /* 0.13 */ }, - { 48961, 32, /* 0.13 */ }, - { 51258, 32, /* 0.13 */ }, - { 53677, 31, /* 0.12 */ }, - { 56230, 30, /* 0.12 */ }, - { 58932, 30, /* 0.12 */ }, - { 61799, 29, /* 0.12 */ }, - { 64851, 28, /* 0.11 */ }, 
- { 68113, 28, /* 0.11 */ }, - { 71617, 27, /* 0.11 */ }, - { 75401, 26, /* 0.10 */ }, - { 79517, 26, /* 0.10 */ }, - { 84035, 25, /* 0.10 */ }, - { 89053, 24, /* 0.10 */ }, + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 
61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, }; #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 031361311a8b..58469fff6c18 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -98,7 +98,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) } } -static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt) +static void measure_achieved_throughput(struct sock *sk, + u32 pkts_acked, s32 rtt) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); @@ -148,8 +149,8 @@ static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) if (use_bandwidth_switch) { u32 maxB = ca->maxB; u32 old_maxB = ca->old_maxB; - ca->old_maxB = ca->maxB; + ca->old_maxB = ca->maxB; if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { ca->beta = BETA_MIN; ca->modeswitch = 0; @@ -270,6 +271,7 @@ static void htcp_state(struct sock *sk, u8 new_state) case TCP_CA_Open: { struct htcp *ca = inet_csk_ca(sk); + if (ca->undo_last_cong) { ca->last_cong = jiffies; ca->undo_last_cong = 0; diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index d8f8f05a4951..f963b274f2b0 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -29,7 +29,6 @@ static int rtt0 = 25; module_param(rtt0, int, 0644); MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); - /* This is called to refresh values for hybla parameters */ static inline void hybla_recalc_param (struct sock *sk) { diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 5999b3972e64..1d5a30a90adf 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -284,7 +284,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) delta = (tp->snd_cwnd_cnt * ca->alpha) 
>> ALPHA_SHIFT; if (delta >= tp->snd_cwnd) { tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, - (u32) tp->snd_cwnd_clamp); + (u32)tp->snd_cwnd_clamp); tp->snd_cwnd_cnt = 0; } } @@ -299,7 +299,6 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); } - /* Extract info for Tcp socket info provided via netlink. */ static void tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40639c288dc2..00a41499d52c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,7 @@ #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> -#include <net/netdma.h> +#include <linux/errqueue.h> int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } -static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { - if (!(tp->ecn_flags & TCP_ECN_OK)) - return; - switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct 
tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } - /* fallinto */ + tp->ecn_flags |= TCP_ECN_SEEN; + break; default: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; + break; } } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +{ + if (tp->ecn_flags & TCP_ECN_OK) + __tcp_ecn_check_ce(tp, skb); +} + +static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return true; @@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } icsk->icsk_ack.lrcvtime = now; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); @@ -1294,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; - skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); - 
skb_shinfo(skb)->gso_segs -= pcount; + tcp_skb_pcount_add(prev, pcount); + BUG_ON(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK @@ -1309,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (skb_shinfo(skb)->gso_segs <= 1) { + if (tcp_skb_pcount(skb) <= 1) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } @@ -1887,33 +1897,34 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } -/* Enter Loss state. If "how" is not zero, forget all SACK information +/* Enter Loss state. If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ -void tcp_enter_loss(struct sock *sk, int how) +void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; bool new_recovery = false; + bool is_reneg; /* is receiver reneging on SACKs? */ /* Reduce ssthresh if it has not yet been made inside this window. 
*/ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,18 +1934,22 @@ void tcp_enter_loss(struct sock *sk, int how) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - if (how) { + skb = tcp_write_queue_head(sk); + is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); + if (is_reneg) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; } @@ -1944,11 +1959,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1966,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk, int how) sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on @@ -1981,19 +1993,21 @@ void tcp_enter_loss(struct sock *sk, int how) * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * - * Do processing similar to RTO timeout. 
+ * To avoid big spurious retransmission bursts due to transient SACK + * scoreboard oddities that look like reneging, we give the receiver a + * little time (max(RTT/2, 10ms)) to send us some more ACKs that will + * restore sanity to the SACK scoreboard. If the apparent reneging + * persists until this RTO then we'll clear the SACK scoreboard. */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { - struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), + msecs_to_jiffies(10)); - tcp_enter_loss(sk, 1); - icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, TCP_RTO_MAX); + delay, TCP_RTO_MAX); return true; } return false; @@ -2356,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; - TCP_ECN_withdraw_cwr(tp); + tcp_ecn_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); @@ -2475,7 +2489,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * losses and/or application stalls), do not perform any further cwnd * reductions, but instead slow start up to ssthresh. 
*/ -static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) +static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2485,9 +2499,8 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; tp->prr_out = 0; - if (set_ssthresh) - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tcp_ecn_queue_cwr(tp); } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, @@ -2528,14 +2541,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } @@ -2564,7 +2577,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) tp->retrans_stamp = 0; if (flag & FLAG_ECE) - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); @@ -2664,13 +2677,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out ? 
: -1; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); } tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2680,7 +2692,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) */ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) { - struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2706,12 +2717,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) if (recovered) { /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ - icsk->icsk_retransmits = 0; tcp_try_undo_recovery(sk); return; } - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; if (tcp_is_reno(tp)) { /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. @@ -2968,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. 
@@ -3043,10 +3052,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; + if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3203,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { + unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + when, TCP_RTO_MAX); } } @@ -3346,7 +3361,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tp->tlp_high_seq = 0; /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ if (!(flag & FLAG_DSACKING_ACK)) { - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); @@ -3356,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } +static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->in_ack_event) + icsk->icsk_ca_ops->in_ack_event(sk, flags); +} + /* This routine deals with incoming acks, but not outgoing ones. 
*/ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3393,8 +3416,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) + if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; + icsk->icsk_retransmits = 0; + } prior_fackets = tp->fackets_out; @@ -3413,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(sk, CA_EVENT_FAST_ACK); + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { + u32 ack_ev_flags = CA_ACK_SLOWPATH; + if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3428,10 +3455,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } - tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + + tcp_in_ack_event(sk, ack_ev_flags); } /* We passed data and got it acked, remove any soft error @@ -4053,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. 
+ * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + int delta; + + *fragstolen = false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + + if (!skb_try_coalesce(to, from, fragstolen, &delta)) + return false; + + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + return true; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4060,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; - struct sk_buff *skb; + struct sk_buff *skb, *tail; + bool fragstolen, eaten; while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) @@ -4073,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk) tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -4083,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, &tp->out_of_order_queue); - __skb_queue_tail(&sk->sk_receive_queue, skb); + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tcp_hdr(skb)->fin) + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); + if 
(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); + if (eaten) + kfree_skb_partial(skb, fragstolen); } } @@ -4114,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } -/** - * tcp_try_coalesce - try to merge skb to prior one - * @sk: socket - * @to: prior buffer - * @from: buffer to add in queue - * @fragstolen: pointer to boolean - * - * Before queueing skb @from after @to, try to merge them - * to reduce overall memory use and queue lengths, if cost is small. - * Packets in ofo or receive queues can stay a long time. - * Better try to coalesce them right now to avoid future collapses. - * Returns true if caller should free @from instead of queueing it - */ -static bool tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from, - bool *fragstolen) -{ - int delta; - - *fragstolen = false; - - if (tcp_hdr(from)->fin) - return false; - - /* Its possible this segment overlaps with prior segment in queue */ - if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) - return false; - - if (!skb_try_coalesce(to, from, fragstolen, &delta)) - return false; - - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); - TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; - TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - return true; -} - static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb1; u32 seq, end_seq; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -4299,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb = NULL; - struct tcphdr *th; + struct sk_buff *skb; bool fragstolen; if (size == 0) 
return 0; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + skb = alloc_skb(size, sk->sk_allocation); if (!skb) goto err; - if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - th = (struct tcphdr *)skb_put(skb, sizeof(*th)); - skb_reset_transport_header(skb); - memset(th, 0, sizeof(*th)); - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) goto err_free; @@ -4324,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4338,7 +4368,6 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; bool fragstolen = false; @@ -4347,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto drop; skb_dst_drop(skb); - __skb_pull(skb, th->doff * 4); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); - TCP_ECN_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(tp, skb); tp->rx_opt.dsack = 0; @@ -4391,7 +4420,7 @@ queue_and_out: tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); - if (th->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -4506,7 +4535,7 @@ restart: * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; @@ -4525,30 +4554,18 @@ restart: /* Decided to skip this, advance start seq. 
*/ start = TCP_SKB_CB(skb)->end_seq; } - if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; - unsigned int header = skb_headroom(skb); - int copy = SKB_MAX_ORDER(header, 0); - /* Too big header? This can happen with IPv6. */ - if (copy < 0) - return; - if (end - start < copy) - copy = end - start; - nskb = alloc_skb(copy + header, GFP_ATOMIC); + nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) return; - skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, (skb_network_header(skb) - - skb->head)); - skb_set_transport_header(nskb, (skb_transport_header(skb) - - skb->head)); - skb_reserve(nskb, header); - memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_queue_before(list, skb, nskb); @@ -4572,8 +4589,7 @@ restart: skb = tcp_collapse_one(sk, skb, list); if (!skb || skb == tail || - tcp_hdr(skb)->syn || - tcp_hdr(skb)->fin) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; } } @@ -4941,53 +4957,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = 
dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5167,27 +5136,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; - int copied_early = 0; bool fragstolen = false; - if (tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && !copied_early) { - __set_current_state(TASK_RUNNING); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) - eaten = 1; - } - if (eaten) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. 
* Hence, check seq<=rcv_wup reduces to: @@ -5203,9 +5160,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; } - if (copied_early) - tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -5242,14 +5198,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - if (!copied_early || tp->rcv_nxt != tp->rcv_wup) - __tcp_ack_snd_check(sk, 0); + __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk); @@ -5443,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * state to ESTABLISHED..." */ - TCP_ECN_rcv_synack(tp, th); + tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5562,7 +5512,7 @@ discard: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - TCP_ECN_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -5877,3 +5827,190 @@ discard: return 0; } EXPORT_SYMBOL(tcp_rcv_state_process); + +static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + if (family == AF_INET) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), + &ireq->ir_rmt_addr, port); +#if IS_ENABLED(CONFIG_IPV6) + else if (family == AF_INET6) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), + &ireq->ir_v6_rmt_addr, port); +#endif +} + +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. 
In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negotiation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We invert the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) it's probably a non-DCTCP aware sender. + */ +static void tcp_ecn_create_request(struct request_sock *req, + const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + +int tcp_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct request_sock *req; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; + bool want_cookie = false, fastopen; + struct flowi fl; + struct tcp_fastopen_cookie foc = { .len = -1 }; + int err; + + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } + + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. 
+ */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + req = inet_reqsk_alloc(rsk_ops); + if (!req) + goto drop; + + tcp_rsk(req)->af_specific = af_ops; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = af_ops->mss_clamp; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb, sk); + + af_ops->init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + tcp_ecn_create_request(req, skb, sk); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tcp_death_row.sysctl_tw_recycle) { + bool strict; + + dst = af_ops->route_req(sk, &fl, req, &strict); + + if (dst && strict && + !tcp_peer_is_proven(req, dst, true, + tmp_opt.saw_tstamp)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false, + tmp_opt.saw_tstamp)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. 
+ * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + pr_drop_req(req, ntohs(tcp_hdr(skb)->source), + rsk_ops->family); + goto drop_and_release; + } + + isn = af_ops->init_seq(skb); + } + if (!dst) { + dst = af_ops->route_req(sk, &fl, req, NULL); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return 0; +} +EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 77cccda1ad0c..552e87e3c269 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include <net/inet_common.h> #include <net/timewait_sock.h> #include <net/xfrm.h> -#include <net/netdma.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> #include <net/busy_poll.h> @@ -90,7 +89,6 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; EXPORT_SYMBOL(sysctl_tcp_low_latency); - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -99,7 +97,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) +static __u32 tcp_v4_init_sequence(const struct sk_buff *skb) { return secure_tcp_sequence_number(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, @@ -208,6 +206,8 @@ int 
tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; + inet_set_txhash(sk); + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -269,7 +269,7 @@ EXPORT_SYMBOL(tcp_v4_connect); * It can be called through tcp_release_cb() if socket was owned by user * at the time tcp_v4_err() was called to handle ICMP message. */ -static void tcp_v4_mtu_reduced(struct sock *sk) +void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -300,6 +300,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } +EXPORT_SYMBOL(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -342,11 +343,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - if (icmp_skb->len < (iph->ihl << 2) + 8) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - return; - } - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { @@ -433,15 +429,16 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) break; icsk->icsk_backoff--; - inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT) << icsk->icsk_backoff; - tcp_bound_rto(sk); + icsk->icsk_rto = tp->srtt_us ? 
__tcp_set_rto(tp) : + TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); skb = tcp_write_queue_head(sk); BUG_ON(!skb); - remaining = icsk->icsk_rto - min(icsk->icsk_rto, - tcp_time_stamp - TCP_SKB_CB(skb)->when); + remaining = icsk->icsk_rto - + min(icsk->icsk_rto, + tcp_time_stamp - tcp_skb_timestamp(skb)); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -683,8 +680,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -766,8 +764,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } @@ -814,6 +813,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * socket. 
*/ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) @@ -837,24 +837,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, ireq->ir_rmt_addr, ireq->opt); err = net_xmit_eval(err); - if (!tcp_rsk(req)->snt_synack && !err) - tcp_rsk(req)->snt_synack = tcp_time_stamp; } return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) -{ - int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); - - if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} - /* * IPv4 request_sock destructor. */ @@ -898,18 +885,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1064,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (sin->sin_family != AF_INET) return -EINVAL; - if (!cmd.tcpm_key || !cmd.tcpm_keylen) + if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, AF_INET); @@ -1182,7 +1167,8 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static bool __tcp_v4_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1235,163 +1221,81 @@ static bool tcp_v4_inbound_md5_hash(struct sock 
*sk, const struct sk_buff *skb) return false; } +static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + bool ret; + + rcu_read_lock(); + ret = __tcp_v4_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ir_loc_addr = ip_hdr(skb)->daddr; + ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(skb); +} + +static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); + + if (strict) { + if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) + *strict = true; + else + *strict = false; + } + + return dst; +} + struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_rtx_synack, + .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, + .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { + .mss_clamp = TCP_MSS_DEFAULT, +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_reqsk_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, -}; #endif + .init_req = tcp_v4_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v4_init_sequence, +#endif + .route_req = tcp_v4_route_req, + .init_seq = tcp_v4_init_sequence, + .send_synack = tcp_v4_send_synack, + .queue_hash_add = inet_csk_reqsk_queue_hash_add, +}; int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct inet_request_sock *ireq; - 
struct tcp_sock *tp = tcp_sk(sk); - struct dst_entry *dst = NULL; - __be32 saddr = ip_hdr(skb)->saddr; - __be32 daddr = ip_hdr(skb)->daddr; - __u32 isn = TCP_SKB_CB(skb)->when; - bool want_cookie = false, fastopen; - struct flowi4 fl4; - struct tcp_fastopen_cookie foc = { .len = -1 }; - int err; - /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); - if (!want_cookie) - goto drop; - } + return tcp_conn_request(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, skb); - /* Accept backlog is full. If we have already queued enough - * of warm entries in syn queue, drop request. It is better than - * clogging syn queue with openreqs with exponentially increasing - * timeout. - */ - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet_reqsk_alloc(&tcp_request_sock_ops); - if (!req) - goto drop; - -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; -#endif - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = TCP_MSS_DEFAULT; - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? 
NULL : &foc); - - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); - - ireq = inet_rsk(req); - ireq->ir_loc_addr = daddr; - ireq->ir_rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(skb); - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - if (want_cookie) { - isn = cookie_v4_init_sequence(sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && - fl4.daddr == saddr) { - if (!tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. - * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. 
- */ - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &saddr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } - - isn = tcp_v4_init_sequence(skb); - } - if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) - goto drop_and_free; - - tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v4_send_synack(sk, dst, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_rsk(req)->listener = NULL; - inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; @@ -1439,6 +1343,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; + inet_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1523,7 +1428,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1539,16 +1444,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; -#ifdef CONFIG_TCP_MD5SIG - /* - * We really want to reject the packet as early as possible - * if: - * o We're expecting an MD5'd packet and this is no MD5 tcp option - * o There is an MD5 option and we're not expecting one - */ - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard; 
-#endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; @@ -1663,7 +1558,17 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) skb_queue_len(&tp->ucopy.prequeue) == 0) return false; - skb_dst_force(skb); + /* Before escaping RCU protected region, we need to take care of skb + * dst. Prequeue is only enabled for established sockets. + * For such sockets, we might need the skb dst only to set sk->sk_rx_dst + * Instead of doing full sk_rx_dst validity here, let's perform + * an optimistic check. + */ + if (likely(sk->sk_rx_dst)) + skb_dst_drop(skb); + else + skb_dst_force(skb); + __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; if (tp->ucopy.memory > sk->sk_rcvbuf) { @@ -1728,11 +1633,19 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; @@ -1751,6 +1664,18 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; + +#ifdef CONFIG_TCP_MD5SIG + /* + * We really want to reject the packet as early as possible + * if: + * o We're expecting an MD5'd packet and this is no MD5 tcp option + * o There is an MD5 option and we're not expecting one + */ + if (tcp_v4_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + nf_reset(skb); if (sk_filter(sk, skb)) @@ -1762,18 +1687,8 @@ process: bh_lock_sock_nested(sk); ret = 0; 
if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -1857,9 +1772,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (dst) { + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } } EXPORT_SYMBOL(inet_sk_rx_dst_set); @@ -1880,6 +1797,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_SYMBOL(ipv4_specific); @@ -1932,11 +1850,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); @@ -2274,7 +2187,7 @@ int tcp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; - s->last_pos = 0; + s->last_pos = 0; return 0; } EXPORT_SYMBOL(tcp_seq_open); @@ -2499,7 +2412,6 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7a2ec3ac584..1d191357bf88 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ 
-32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) res_parent = &parent_cg->memory_allocated; res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } @@ -222,7 +222,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4fe041805989..ed9c9a91851c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -576,7 +576,8 @@ reset: tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, + bool paws_check, bool timestamps) { struct tcp_metrics_block *tm; bool ret; @@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa if (paws_check) { if (tm && (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && - (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) + ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW || + !timestamps)) ret = false; else ret = true; @@ -1093,7 +1095,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, .policy = tcp_metrics_nl_policy, - .flags = GENL_ADMIN_PERM, }, { .cmd = TCP_METRICS_CMD_DEL, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e68e0d4af6c9..63d2680b65db 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -232,7 +232,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->when = isn; + TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } 
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = np->flow_label >> 12; - tw->tw_ipv6only = np->ipv6only; + tw->tw_ipv6only = sk->sk_ipv6only; } #endif @@ -393,8 +393,8 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, - struct request_sock *req) +static void tcp_ecn_openreq_child(struct tcp_sock *tp, + const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } @@ -451,9 +451,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && - !try_module_get(newicsk->icsk_ca_ops->owner)) - newicsk->icsk_ca_ops = &tcp_init_congestion_ops; + if (!try_module_get(newicsk->icsk_ca_ops->owner)) + tcp_assign_congestion_control(newsk); tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); @@ -508,7 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - TCP_ECN_openreq_child(newtp, req); + tcp_ecn_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 55046ecd083e..5b90f2f447a5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,6 +14,43 @@ #include <net/tcp.h> #include <net/protocol.h> +static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, + unsigned int seq, unsigned int mss) +{ + while (skb) { + if (before(ts_seq, seq + mss)) { + skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tskey = ts_seq; + return; + } + + skb = skb->next; + 
seq += mss; + } +} + +struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + return ERR_PTR(-EINVAL); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + /* Set up checksum pseudo header, usually expect stack to + * have done this already. + */ + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v4_send_check(skb, iph->saddr, iph->daddr); + } + + return tcp_gso_segment(skb, features); +} + struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -29,9 +66,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, __sum16 newcheck; bool ooo_okay, copy_destructor; - if (!pskb_may_pull(skb, sizeof(*th))) - goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) @@ -91,6 +125,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) + tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); @@ -251,54 +288,16 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static int tcp_v4_gso_send_check(struct sk_buff *skb) -{ - const struct iphdr *iph; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - iph = ip_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v4_send_check(skb, iph->saddr, iph->daddr); - return 0; -} - static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - /* Use the IP hdr immediately proceeding for this transport */ - const struct iphdr *iph = skb_gro_network_header(skb); - __wsum wsum; - /* Don't bother verifying checksum if we're going to flush anyway. 
*/ - if (NAPI_GRO_CB(skb)->flush) - goto skip_csum; - - wsum = NAPI_GRO_CB(skb)->csum; - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), - 0); - - /* fall through */ - - case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, - wsum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - + if (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate(skb, IPPROTO_TCP, + inet_gro_compute_pseudo)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } -skip_csum: return tcp_gro_receive(head, skb); } @@ -316,8 +315,7 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) static const struct net_offload tcpv4_offload = { .callbacks = { - .gso_send_check = tcp_v4_gso_send_check, - .gso_segment = tcp_gso_segment, + .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, }, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 179b51e6bda3..8d4eac793700 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -318,36 +318,47 @@ static u16 tcp_select_window(struct sock *sk) } /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. 
*/ -static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk)) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); } } -static __inline__ void -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) +static void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, + struct sock *sk) { - if (inet_rsk(req)->ecn_ok) + if (inet_rsk(req)->ecn_ok) { th->ece = 1; + if (tcp_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); + } } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, +static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); @@ -362,7 +373,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else { + } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } @@ -384,7 +395,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; @@ -550,7 +561,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -618,7 +629,7 @@ static 
unsigned int tcp_synack_options(struct sock *sk, } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsval = tcp_skb_timestamp(skb); opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } @@ -647,7 +658,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { - struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; @@ -666,7 +676,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; - opts->tsval = tcb ? tcb->when + tp->tsoffset : 0; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } @@ -800,7 +810,7 @@ void tcp_release_cb(struct sock *sk) __sock_put(sk); } if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { - sk->sk_prot->mtu_reduced(sk); + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } } @@ -886,8 +896,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; - /* Our usage of tstamp should remain private */ - skb->tstamp.tv64 = 0; } inet = inet_sk(sk); @@ -916,6 +924,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; skb->destructor = tcp_wfree; + skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. 
*/ @@ -951,7 +960,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - TCP_ECN_send(sk, skb, tcp_header_size); + tcp_ecn_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -974,11 +983,22 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); + /* OK, its time to fill skb_shinfo(skb)->gso_segs */ + skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); + + /* Our usage of tstamp should remain private */ + skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + if (likely(err <= 0)) return err; - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); return net_xmit_eval(err); } @@ -994,7 +1014,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; - skb_header_release(skb); + __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -1013,11 +1033,11 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. 
*/ - shinfo->gso_segs = 1; + tcp_skb_pcount_set(skb, 1); shinfo->gso_size = 0; shinfo->gso_type = 0; } else { - shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); shinfo->gso_size = mss_now; shinfo->gso_type = sk->sk_gso_type; } @@ -1068,6 +1088,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } +static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) && + !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { + struct skb_shared_info *shinfo2 = skb_shinfo(skb2); + u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; + + shinfo->tx_flags &= ~tsflags; + shinfo2->tx_flags |= tsflags; + swap(shinfo->tskey, shinfo2->tskey); + } +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -1130,11 +1165,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, buff->ip_summed = skb->ip_summed; - /* Looks stupid, but our code really uses when of - * skbs, which it never sent before. --ANK - */ - TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; buff->tstamp = skb->tstamp; + tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -1154,7 +1186,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, } /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1651,13 +1683,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); + tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. 
*/ tcp_set_skb_tso_segs(sk, skb, mss_now); tcp_set_skb_tso_segs(sk, buff, mss_now); /* Link BUFF into the send queue. */ - skb_header_release(buff); + __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); return 0; @@ -1856,8 +1889,8 @@ static int tcp_mtu_probe(struct sock *sk) tcp_init_tso_segs(sk, nskb, nskb->len); /* We're ready to send. If this fails, the probe will - * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ @@ -1916,8 +1949,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb_mstamp_get(&skb->skb_mstamp); goto repair; /* Skip network transmission */ + } cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) { @@ -1979,8 +2015,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2076,10 +2110,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - const struct sk_buff *fclone = skb + 1; - - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; @@ -2478,7 +2509,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Make a copy, if the first 
transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -2523,7 +2553,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Save stamp of the first retransmit. */ if (!tp->retrans_stamp) - tp->retrans_stamp = TCP_SKB_CB(skb)->when; + tp->retrans_stamp = tcp_skb_timestamp(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). @@ -2731,7 +2761,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -2759,7 +2788,7 @@ int tcp_send_synack(struct sock *sk) if (nskb == NULL) return -ENOMEM; tcp_unlink_write_queue(skb, sk); - skb_header_release(nskb); + __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); sk_wmem_free_skb(sk, skb); sk->sk_wmem_queued += nskb->truesize; @@ -2768,9 +2797,8 @@ int tcp_send_synack(struct sock *sk) } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; - TCP_ECN_send_synack(tcp_sk(sk), skb); + tcp_ecn_send_synack(sk, skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2814,10 +2842,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(&opts, 0, sizeof(opts)); #ifdef CONFIG_SYN_COOKIES if (unlikely(req->cookie_ts)) - TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req); else #endif - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, foc) + sizeof(*th); @@ -2828,7 +2856,7 @@ struct sk_buff 
*tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - TCP_ECN_make_synack(req, th); + tcp_ecn_make_synack(req, th, sk); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; /* Setting of flags are superfluous here for callers (and ECE is @@ -2935,7 +2963,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; - skb_header_release(skb); + __skb_header_release(skb); __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); @@ -3065,9 +3093,9 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); - tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); - TCP_ECN_send_syn(sk, buff); + tcp_ecn_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3099,6 +3127,8 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; + tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); + if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3155,6 +3185,8 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; + tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); + /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. @@ -3173,9 +3205,10 @@ void tcp_send_ack(struct sock *sk) tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. 
*/ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. @@ -3205,7 +3238,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + skb_mstamp_get(&skb->skb_mstamp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -3249,7 +3282,6 @@ int tcp_write_wakeup(struct sock *sk) tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); @@ -3268,6 +3300,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long probe_max; int err; err = tcp_write_wakeup(sk); @@ -3283,9 +3316,7 @@ void tcp_send_probe0(struct sock *sk) if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + probe_max = TCP_RTO_MAX; } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember icsk_probes_out. 
@@ -3295,9 +3326,24 @@ void tcp_send_probe0(struct sock *sk) */ if (!icsk->icsk_probes_out) icsk->icsk_probes_out = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + probe_max = TCP_RESOURCE_PROBE_INTERVAL; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + inet_csk_rto_backoff(icsk, probe_max), + TCP_RTO_MAX); +} + +int tcp_rtx_synack(struct sock *sk, struct request_sock *req) +{ + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + struct flowi fl; + int res; + + res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); } + return res; } +EXPORT_SYMBOL(tcp_rtx_synack); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 3b66610d4156..ebf5ff57526e 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -83,7 +83,6 @@ static struct { struct tcp_log *log; } tcp_probe; - static inline int tcp_probe_used(void) { return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); @@ -101,7 +100,6 @@ static inline int tcp_probe_avail(void) si4.sin_addr.s_addr = inet->inet_##mem##addr; \ } while (0) \ - /* * Hook inserted to be called before each receive packet. * Note: arguments must match tcp_rcv_established()! 
@@ -194,8 +192,8 @@ static int tcpprobe_sprint(char *tbuf, int n) return scnprintf(tbuf, n, "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long) tv.tv_sec, - (unsigned long) tv.tv_nsec, + (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_nsec, &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 8250949b8853..6824afb65d93 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -31,10 +31,10 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } - static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 286227abed10..9b21ae8b2e31 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk) * limit. * 2. If we have strong memory pressure. */ -static int tcp_out_of_resources(struct sock *sk, int do_reset) +static int tcp_out_of_resources(struct sock *sk, bool do_reset) { struct tcp_sock *tp = tcp_sk(sk); int shift = 0; @@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. 
*/ (!tp->snd_wnd && !tp->packets_out)) - do_reset = 1; + do_reset = true; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); @@ -135,10 +135,9 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - if (unlikely(!tcp_sk(sk)->retrans_stamp)) - start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; - else - start_ts = tcp_sk(sk)->retrans_stamp; + start_ts = tcp_sk(sk)->retrans_stamp; + if (unlikely(!start_ts)) + start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -181,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = (icsk->icsk_rto < TCP_RTO_MAX); + const int alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -271,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; + u32 start_ts; if (tp->packets_out || !tcp_send_head(sk)) { icsk->icsk_probes_out = 0; return; } - /* *WARNING* RFC 1122 forbids this - * - * It doesn't AFAIK, because we kill the retransmit timer -AK - * - * FIXME: We ought not to do it, Solaris 2.5 actually has fixing - * this behaviour in Solaris down as a bug fix. [AC] - * - * Let me to explain. icsk_probes_out is zeroed by incoming ACKs - * even if they advertise zero window. Hence, connection is killed only - * if we received no ACKs for normal connection timeout. It is not killed - * only because window stays zero for some time, window may be zero - * until armageddon and even later. We are in full accordance - * with RFCs, only probe timer combines both retransmission timeout - * and probe timeout in one bottle. --ANK + /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as + * long as the receiver continues to respond probes. 
We support this by + * default and reset icsk_probes_out with incoming ACKs. But if the + * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we + * kill the socket when the retry count and the time exceeds the + * corresponding system limit. We also implement similar policy when + * we use RTO to probe window in tcp_retransmit_timer(). */ - max_probes = sysctl_tcp_retries2; + start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + if (!start_ts) + skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + else if (icsk->icsk_user_timeout && + (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + goto abort; + max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); + const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); - - if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) + if (!alive && icsk->icsk_backoff >= max_probes) + goto abort; + if (tcp_out_of_resources(sk, true)) return; } if (icsk->icsk_probes_out > max_probes) { - tcp_write_err(sk); +abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. 
*/ tcp_send_probe0(sk); @@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_write_err(sk); goto out; } - tcp_enter_loss(sk, 0); + tcp_enter_loss(sk); tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); __sk_dst_reset(sk); goto out_reset_timer; @@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), mib_idx); } - tcp_enter_loss(sk, 0); + tcp_enter_loss(sk); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9a5e05f27f4f..a6afde666ab1 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -51,7 +51,6 @@ MODULE_PARM_DESC(beta, "upper bound of packets in network"); module_param(gamma, int, 0644); MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); - /* There are several situations when we must "re-start" Vegas: * * o when a connection is established @@ -133,7 +132,6 @@ EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked); void tcp_vegas_state(struct sock *sk, u8 ca_state) { - if (ca_state == TCP_CA_Open) vegas_enable(sk); else @@ -218,7 +216,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) * This is: * (actual rate in segments) * baseRTT */ - target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt; + target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT; + do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity @@ -284,7 +283,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* Use normal slow start */ else if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); - } /* Extract info for Tcp socket info provided via netlink. 
*/ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 27b9825753d1..a4d2d2d88dca 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) rtt = veno->minrtt; - target_cwnd = (tp->snd_cwnd * veno->basertt); + target_cwnd = (u64)tp->snd_cwnd * veno->basertt; target_cwnd <<= V_PARAM_SHIFT; do_div(target_cwnd, rtt); @@ -175,7 +175,6 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else tp->snd_cwnd_cnt++; } - } if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b94a04ae2ed5..bb63fba47d47 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -42,7 +42,6 @@ struct westwood { u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; - /* TCP Westwood functions and constants */ #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ @@ -153,7 +152,6 @@ static inline void update_rtt_min(struct westwood *w) w->rtt_min = min(w->rtt, w->rtt_min); } - /* * @westwood_fast_bw * It is called when we are in fast path. 
In particular it is called when @@ -208,7 +206,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } - /* * TCP Westwood * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it @@ -219,47 +216,51 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } +static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) +{ + if (ack_flags & CA_ACK_SLOWPATH) { + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + + update_rtt_min(w); + return; + } + + westwood_fast_bw(sk); +} + static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { - case CA_EVENT_FAST_ACK: - westwood_fast_bw(sk); - break; - case CA_EVENT_COMPLETE_CWR: tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; - - case CA_EVENT_SLOW_ACK: - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); - update_rtt_min(w); - break; - default: /* don't care */ break; } } - /* Extract info for Tcp socket info provided via netlink. 
*/ static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct westwood *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct tcpvegas_info info = { .tcpv_enabled = 1, @@ -271,12 +272,12 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, } } - static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .cwnd_event = tcp_westwood_event, + .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 599b79b8eac0..cd7273218598 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -54,10 +54,8 @@ static void tcp_yeah_init(struct sock *sk) /* Ensure the MD arithmetic works. This is somewhat pedantic, * since I don't think we will see a cwnd this large. :) */ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); - } - static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -84,7 +82,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* Scalable */ tp->snd_cwnd_cnt += yeah->pkts_acked; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; @@ -120,7 +118,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ if (after(ack, yeah->vegas.beg_snd_nxt)) { - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. 
@@ -189,7 +186,6 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } yeah->lastQ = queue; - } /* Save the extent of the current window so we can use this @@ -205,7 +201,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } } -static u32 tcp_yeah_ssthresh(struct sock *sk) { +static u32 tcp_yeah_ssthresh(struct sock *sk) +{ const struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); u32 reduction; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d5a8661df76..cd0db5471bb5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -99,6 +99,7 @@ #include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> +#include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> @@ -224,7 +225,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, remaining = (high - low) + 1; rand = prandom_u32(); - first = (((u64)rand * remaining) >> 32) + low; + first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ @@ -448,7 +449,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -529,7 +530,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -594,27 +595,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct hlist_nulls_node *node; - struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); - - sk_nulls_for_each_from(s, node) { - if (__udp_is_mcast_sock(net, s, - loc_port, loc_addr, - 
rmt_port, rmt_addr, - dif, hnum)) - goto found; - } - s = NULL; -found: - return s; -} - /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -1588,7 +1568,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; @@ -1640,6 +1620,8 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -1668,41 +1650,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; - unsigned int i, count = 0; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = skb->dev->ifindex; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + udp_table.mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - stack[count++] = sk; - sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 
0; + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { + if (__udp_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } + stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing chain lock, we must take a reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + /* * do the slow work with no lock held */ if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } @@ -1797,6 +1788,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); @@ -1977,7 +1972,7 @@ void udp_v4_early_demux(struct sk_buff *skb) return; skb->sk = sk; - skb->destructor = sock_edemux; + skb->destructor = sock_efree; dst = sk->sk_rx_dst; if (dst) @@ -2526,79 +2521,3 @@ void __init udp_init(void) sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; } - -struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - __be16 protocol = skb->protocol; - netdev_features_t enc_features; - int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum; - - oldlen = (u16)~skb->len; - - if (unlikely(!pskb_may_pull(skb, tnl_hlen))) - goto out; - - skb->encapsulation = 0; - __skb_pull(skb, tnl_hlen); - 
skb_reset_mac_header(skb); - skb_set_network_header(skb, skb_inner_network_offset(skb)); - skb->mac_len = skb_inner_network_offset(skb); - skb->protocol = htons(ETH_P_TEB); - - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; - - /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); - segs = skb_mac_gso_segment(skb, enc_features); - if (!segs || IS_ERR(segs)) { - skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, - mac_len); - goto out; - } - - outer_hlen = skb_tnl_header_len(skb); - udp_offset = outer_hlen - tnl_hlen; - skb = segs; - do { - struct udphdr *uh; - int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - - skb->mac_len = mac_len; - - skb_push(skb, outer_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, mac_len); - skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; - uh = udp_hdr(skb); - uh->len = htons(len); - - if (need_csum) { - __be32 delta = htonl(oldlen + len); - - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - uh->check = gso_make_checksum(skb, ~uh->check); - - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } - - skb->protocol = protocol; - } while ((skb = skb->next)); -out: - return segs; -} diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 546d2d439dda..507310ef4b56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -25,26 +25,121 @@ struct udp_offload_priv { struct udp_offload_priv __rcu *next; }; -static int udp4_ufo_send_check(struct sk_buff *skb) +static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features), + __be16 new_protocol) { - if (!pskb_may_pull(skb, sizeof(struct udphdr))) - return -EINVAL; + struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = 
skb->mac_header; + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + __be16 protocol = skb->protocol; + netdev_features_t enc_features; + int udp_offset, outer_hlen; + unsigned int oldlen; + bool need_csum; + + oldlen = (u16)~skb->len; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; + + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = new_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); + if (need_csum) + skb->encap_hdr_csum = 1; + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = gso_inner_segment(skb, enc_features); + if (IS_ERR_OR_NULL(segs)) { + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); + goto out; + } - if (likely(!skb->encapsulation)) { - const struct iphdr *iph; + outer_hlen = skb_tnl_header_len(skb); + udp_offset = outer_hlen - tnl_hlen; + skb = segs; + do { struct udphdr *uh; + int len; - iph = ip_hdr(skb); + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + + skb->mac_len = mac_len; + + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); + uh->len = htons(len); + + if (need_csum) { + __be32 delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + uh->check = gso_make_checksum(skb, ~uh->check); + + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; + skb->protocol = 
protocol; + } while ((skb = skb->next)); +out: + return segs; +} + +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features, + bool is_ipv6) +{ + __be16 protocol = skb->protocol; + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, + netdev_features_t features); + + rcu_read_lock(); + + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = skb->inner_protocol; + gso_inner_segment = skb_mac_gso_segment; + break; + case ENCAP_TYPE_IPPROTO: + offloads = is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[skb->inner_ipproto]); + if (!ops || !ops->callbacks.gso_segment) + goto out_unlock; + gso_inner_segment = ops->callbacks.gso_segment; + break; + default: + goto out_unlock; } - return 0; + segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, + protocol); + +out_unlock: + rcu_read_unlock(); + + return segs; } static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, @@ -52,16 +147,20 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; - int offset; __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, false); goto out; } + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -89,10 +188,16 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, * HW cannot do checksum of UDP packets sent as multiple * IP fragments. 
*/ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_NONE; /* Fragment the skb. IP headers of the fragments are updated in @@ -152,30 +257,24 @@ unlock: } EXPORT_SYMBOL(udp_del_offload); -static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, + struct udphdr *uh) { struct udp_offload_priv *uo_priv; struct sk_buff *p, **pp = NULL; - struct udphdr *uh, *uh2; - unsigned int hlen, off; + struct udphdr *uh2; + unsigned int off = skb_gro_offset(skb); int flush = 1; if (NAPI_GRO_CB(skb)->udp_mark || - (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + (skb->ip_summed != CHECKSUM_PARTIAL && + NAPI_GRO_CB(skb)->csum_cnt == 0 && + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the udp gro layer */ NAPI_GRO_CB(skb)->udp_mark = 1; - off = skb_gro_offset(skb); - hlen = off + sizeof(*uh); - uh = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - uh = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!uh)) - goto out; - } - rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { @@ -193,7 +292,12 @@ unflush: continue; uh2 = (struct udphdr *)(p->data + off); - if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + + /* Match ports and either checksums are either both zero + * or nonzero. 
+ */ + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) || + (!uh->check ^ !uh2->check)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -201,6 +305,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; pp = uo_priv->offload->callbacks.gro_receive(head, skb); out_unlock: @@ -210,7 +315,34 @@ out: return pp; } -static int udp_gro_complete(struct sk_buff *skb, int nhoff) +static struct sk_buff **udp4_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + if (unlikely(!uh)) + goto flush; + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (NAPI_GRO_CB(skb)->flush) + goto skip; + + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_gro_compute_pseudo); +skip: + NAPI_GRO_CB(skb)->is_ipv6 = 0; + return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +int udp_gro_complete(struct sk_buff *skb, int nhoff) { struct udp_offload_priv *uo_priv; __be16 newlen = htons(skb->len - nhoff); @@ -228,19 +360,32 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff) break; } - if (uo_priv != NULL) + if (uo_priv != NULL) { + NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + } rcu_read_unlock(); return err; } +static int udp4_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, + iph->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv4_offload = { .callbacks = { - 
.gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, - .gro_receive = udp_gro_receive, - .gro_complete = udp_gro_complete, + .gro_receive = udp4_gro_receive, + .gro_complete = udp4_gro_complete, }, }; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c new file mode 100644 index 000000000000..1671263e5fa0 --- /dev/null +++ b/net/ipv4/udp_tunnel.c @@ -0,0 +1,108 @@ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/udp.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <net/udp.h> +#include <net/udp_tunnel.h> +#include <net/net_namespace.h> + +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + int err; + struct socket *sock = NULL; + struct sockaddr_in udp_addr; + + err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); + if (err < 0) + goto error; + } + + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; + + *sockp = sock; + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL(udp_sock_create4); + +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *cfg) +{ + struct sock *sk = sock->sk; + + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + + /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ + udp_set_convert_csum(sk, true); + + rcu_assign_sk_user_data(sk, 
cfg->sk_user_data); + + udp_sk(sk)->encap_type = cfg->encap_type; + udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_destroy = cfg->encap_destroy; + + udp_tunnel_encap_enable(sock); +} +EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); + +int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, + __u8 tos, __u8 ttl, __be16 df, __be16 src_port, + __be16 dst_port, bool xnet) +{ + struct udphdr *uh; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + uh->len = htons(skb->len); + + udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); + + return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, + tos, ttl, df, xnet); +} +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); + +void udp_tunnel_sock_release(struct socket *sock) +{ + rcu_assign_sk_user_data(sock->sk, NULL); + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); +} +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index a2ce0101eaac..dccefa9d84cf 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) for_each_protocol_rcu(ah4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) - return ret;; + return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2fe68364bb20..2e8c06108ab9 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -45,3 +45,7 @@ obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o + +ifneq ($(CONFIG_IPV6),) +obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5667b3003af9..725c763270a0 
100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -108,11 +108,12 @@ static inline u32 cstamp_delta(unsigned long cstamp) } #ifdef CONFIG_SYSCTL -static void addrconf_sysctl_register(struct inet6_dev *idev); +static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); #else -static inline void addrconf_sysctl_register(struct inet6_dev *idev) +static inline int addrconf_sysctl_register(struct inet6_dev *idev) { + return 0; } static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) @@ -179,13 +180,14 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, - .use_tempaddr = 0, + .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -222,6 +224,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_from_local = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -308,16 +311,16 @@ err_ip: static struct inet6_dev *ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; + int err = -ENOMEM; ASSERT_RTNL(); if (dev->mtu < IPV6_MIN_MTU) - return NULL; + return ERR_PTR(-EINVAL); ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); - if (ndev == NULL) - return NULL; + return ERR_PTR(err); rwlock_init(&ndev->lock); ndev->dev = dev; @@ -330,7 +333,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (ndev->nd_parms == NULL) { 
kfree(ndev); - return NULL; + return ERR_PTR(err); } if (ndev->cnf.forwarding) dev_disable_lro(dev); @@ -344,17 +347,14 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); - return NULL; + return ERR_PTR(err); } if (snmp6_register_dev(ndev) < 0) { ADBG(KERN_WARNING "%s: cannot create /proc/net/dev_snmp6/%s\n", __func__, dev->name); - neigh_parms_release(&nd_tbl, ndev->nd_parms); - ndev->dead = 1; - in6_dev_finish_destroy(ndev); - return NULL; + goto err_release; } /* One reference from device. We must do this before @@ -392,7 +392,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ipv6_mc_init_dev(ndev); ndev->tstamp = jiffies; - addrconf_sysctl_register(ndev); + err = addrconf_sysctl_register(ndev); + if (err) { + ipv6_mc_destroy_dev(ndev); + del_timer(&ndev->regen_timer); + goto err_release; + } /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); @@ -407,6 +412,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); return ndev; + +err_release: + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return ERR_PTR(err); } static struct inet6_dev *ipv6_find_idev(struct net_device *dev) @@ -418,7 +429,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev) idev = __in6_dev_get(dev); if (!idev) { idev = ipv6_add_dev(dev); - if (!idev) + if (IS_ERR(idev)) return NULL; } @@ -1094,8 +1105,8 @@ retry: spin_unlock_bh(&ifp->lock); regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + idev->cnf.dad_transmits * + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; write_unlock_bh(&idev->lock); /* A temporary address is created only if this calculated Preferred @@ -1679,14 +1690,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) addrconf_mod_dad_work(ifp, 0); } -/* Join 
to solicited addr multicast group. */ - +/* Join to solicited addr multicast group. + * caller must hold RTNL */ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1694,12 +1703,11 @@ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) ipv6_dev_mc_inc(dev, &maddr); } +/* caller must hold RTNL */ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; - ASSERT_RTNL(); - if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1707,26 +1715,24 @@ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &maddr); } +/* caller must hold RTNL */ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; - ipv6_dev_ac_inc(ifp->idev->dev, &addr); + __ipv6_dev_ac_inc(ifp->idev, &addr); } +/* caller must hold RTNL */ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; - ASSERT_RTNL(); - if (ifp->prefix_len >= 127) /* RFC 6164 */ return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -2728,9 +2734,25 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr } } +static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) +{ + if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + /* addrconf_add_linklocal also adds a prefix_route and we + * only need to care about prefix routes if ipv6_generate_eui64 + * couldn't generate one. 
+ */ + if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) + addrconf_add_linklocal(idev, &addr); + else if (prefix_route) + addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + } +} + static void addrconf_dev_config(struct net_device *dev) { - struct in6_addr addr; struct inet6_dev *idev; ASSERT_RTNL(); @@ -2751,11 +2773,7 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; - memset(&addr, 0, sizeof(struct in6_addr)); - addr.s6_addr32[0] = htonl(0xFE800000); - - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); } #if IS_ENABLED(CONFIG_IPV6_SIT) @@ -2777,11 +2795,7 @@ static void addrconf_sit_config(struct net_device *dev) } if (dev->priv_flags & IFF_ISATAP) { - struct in6_addr addr; - - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); + addrconf_addr_gen(idev, false); return; } @@ -2796,7 +2810,6 @@ static void addrconf_sit_config(struct net_device *dev) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; - struct in6_addr addr; ASSERT_RTNL(); @@ -2805,11 +2818,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) - addrconf_add_linklocal(idev, &addr); - else - addrconf_prefix_route(&addr, 64, dev, 0, 0); + addrconf_addr_gen(idev, true); } #endif @@ -2825,8 +2834,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); - if (!idev) - return notifier_from_errno(-ENOMEM); + if (IS_ERR(idev)) + return notifier_from_errno(PTR_ERR(idev)); } break; @@ -2835,6 +2844,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (dev->flags & IFF_SLAVE) break; + if (idev && 
idev->cnf.disable_ipv6) + break; + if (event == NETDEV_UP) { if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. */ @@ -2846,7 +2858,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) idev = ipv6_add_dev(dev); - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { idev->if_flags |= IF_READY; run_pending = 1; } @@ -2889,7 +2901,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - if (idev) { + if (!IS_ERR_OR_NULL(idev)) { if (run_pending) addrconf_dad_run(idev); @@ -2924,7 +2936,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); - if (idev) + if (!IS_ERR(idev)) break; } @@ -2945,10 +2957,14 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev) { snmp6_unregister_dev(idev); addrconf_sysctl_unregister(idev); - addrconf_sysctl_register(idev); - err = snmp6_register_dev(idev); + err = addrconf_sysctl_register(idev); if (err) return notifier_from_errno(err); + err = snmp6_register_dev(idev); + if (err) { + addrconf_sysctl_unregister(idev); + return notifier_from_errno(err); + } } break; @@ -3017,7 +3033,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct hlist_head *h = &inet6_addr_lst[i]; spin_lock_bh(&addrconf_hash_lock); - restart: +restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { hlist_del_init_rcu(&ifa->addr_lst); @@ -3081,11 +3097,13 @@ static int addrconf_ifdown(struct net_device *dev, int how) write_unlock_bh(&idev->lock); - /* Step 5: Discard multicast list */ - if (how) + /* Step 5: Discard anycast and multicast list */ + if (how) { + ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - else + } else { ipv6_mc_down(idev); + } idev->tstamp = jiffies; @@ -3529,8 +3547,8 @@ static void __net_exit if6_proc_net_exit(struct net *net) } static struct pernet_operations 
if6_proc_net_ops = { - .init = if6_proc_net_init, - .exit = if6_proc_net_exit, + .init = if6_proc_net_init, + .exit = if6_proc_net_exit, }; int __init if6_proc_init(void) @@ -4321,6 +4339,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; } static inline size_t inet6_ifla6_size(void) @@ -4420,6 +4439,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (nla == NULL) goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode)) + goto nla_put_failure; + read_lock_bh(&idev->lock); memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); @@ -4524,8 +4547,21 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) BUG(); - if (tb[IFLA_INET6_TOKEN]) + if (tb[IFLA_INET6_TOKEN]) { err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + if (err) + return err; + } + + if (tb[IFLA_INET6_ADDR_GEN_MODE]) { + u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); + + if (mode != IN6_ADDR_GEN_MODE_EUI64 && + mode != IN6_ADDR_GEN_MODE_NONE) + return -EINVAL; + idev->addr_gen_mode = mode; + err = 0; + } return err; } @@ -4737,24 +4773,21 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { struct rt6_info *rt; - struct net_device *dev = ifp->idev->dev; - - rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, - dev->ifindex, 1); - if (rt) { - dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); - } + + rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); + if 
(rt && ip6_del_rt(rt)) + dst_free(&rt->dst); } dst_hold(&ifp->rt->dst); if (ip6_del_rt(ifp->rt)) dst_free(&ifp->rt->dst); + + rt_genid_bump_ipv6(net); break; } atomic_inc(&net->ipv6.dev_addr_genid); - rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -5168,6 +5201,13 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec }, { + .procname = "accept_ra_from_local", + .data = &ipv6_devconf.accept_ra_from_local, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { /* sentinel */ } }, @@ -5218,12 +5258,23 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) kfree(t); } -static void addrconf_sysctl_register(struct inet6_dev *idev) +static int addrconf_sysctl_register(struct inet6_dev *idev) { - neigh_sysctl_register(idev->dev, idev->nd_parms, - &ndisc_ifinfo_sysctl_change); - __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, - idev, &idev->cnf); + int err; + + if (!sysctl_dev_name_is_allowed(idev->dev->name)) + return -EINVAL; + + err = neigh_sysctl_register(idev->dev, idev->nd_parms, + &ndisc_ifinfo_sysctl_change); + if (err) + return err; + err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, + idev, &idev->cnf); + if (err) + neigh_sysctl_unregister(idev->nd_parms); + + return err; } static void addrconf_sysctl_unregister(struct inet6_dev *idev) @@ -5308,6 +5359,7 @@ static struct rtnl_af_ops inet6_ops = { int __init addrconf_init(void) { + struct inet6_dev *idev; int i, err; err = ipv6_addr_label_init(); @@ -5346,11 +5398,12 @@ int __init addrconf_init(void) * device and it being up should be removed. 
*/ rtnl_lock(); - if (!ipv6_add_dev(init_net.loopback_dev)) - err = -ENOMEM; + idev = ipv6_add_dev(init_net.loopback_dev); rtnl_unlock(); - if (err) + if (IS_ERR(idev)) { + err = PTR_ERR(idev); goto errlo; + } for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index e6960457f625..98cc4cd570e2 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -8,6 +8,13 @@ #include <net/addrconf.h> #include <net/ip.h> +/* if ipv6 module registers this function is used by xfrm to force all + * sockets to relookup their nodes - this is fairly expensive, be + * careful + */ +void (*__fib6_flush_trees)(struct net *); +EXPORT_SYMBOL(__fib6_flush_trees); + #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) static inline unsigned int ipv6_addr_scope2type(unsigned int scope) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d9..fd0dc47f471d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7cb4392690dd..e8c4400f23e9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,15 +7,15 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * Fixes: + * Fixes: * piggy, Karl Knutson : Socket protocol table - * Hideaki YOSHIFUJI : sin6_scope_id support - * Arnaldo Melo : check proc_net_create return, cleanups + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, cleanups * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the 
License, or (at your option) any later version. + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv6: " fmt @@ -197,7 +197,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = net->ipv6.sysctl.bindv6only; + sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -294,7 +294,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Binding to v4-mapped address on a v6-only socket * makes no sense */ - if (np->ipv6only) { + if (sk->sk_ipv6only) { err = -EINVAL; goto out; } @@ -302,7 +302,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && @@ -371,7 +371,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (addr_type != IPV6_ADDR_MAPPED) - np->ipv6only = 1; + sk->sk_ipv6only = 1; } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts 
|| @@ -765,7 +765,8 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; - atomic_set(&net->ipv6.rt_genid, 0); + net->ipv6.sysctl.auto_flowlabels = 0; + atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); if (err) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 72a4930bdc0a..6d16eb0e0c7f 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -17,10 +17,10 @@ * Authors * * Mitsuru KANDA @USAGI : IPv6 Support - * Kazunori MIYAZAWA @USAGI : - * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * - * This file is derived from net/ipv4/ah.c. + * This file is derived from net/ipv4/ah.c. */ #define pr_fmt(fmt) "IPv6: " fmt @@ -284,7 +284,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) ipv6_rearrange_rthdr(iph, exthdr.rth); break; - default : + default: return 0; } @@ -478,7 +478,7 @@ static void ah6_input_done(struct crypto_async_request *base, int err) auth_data = ah_tmp_auth(work_iph, hdr_len); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; @@ -622,7 +622,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) goto out_free; } - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? 
-EBADMSG : 0; if (err) goto out_free; @@ -647,8 +647,8 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; - struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset); struct xfrm_state *x; if (type != ICMPV6_PKT_TOOBIG && @@ -713,8 +713,6 @@ static int ah6_init_state(struct xfrm_state *x) ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; - BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); switch (x->props.mode) { @@ -755,11 +753,10 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type ah6_type = -{ +static const struct xfrm_type ah6_type = { .description = "AH6", .owner = THIS_MODULE, - .proto = IPPROTO_AH, + .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah6_init_state, .destructor = ah6_destroy, diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 210183244689..f5e319a8d4e2 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -46,10 +46,6 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); -/* Big ac list lock for all the sockets */ -static DEFINE_SPINLOCK(ipv6_sk_ac_lock); - - /* * socket join an anycast group */ @@ -77,7 +73,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac->acl_next = NULL; pac->acl_addr = *addr; - rcu_read_lock(); + rtnl_lock(); if (ifindex == 0) { struct rt6_info *rt; @@ -90,11 +86,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } else { /* router, no matching interface: just pick one */ - dev = dev_get_by_flags_rcu(net, IFF_UP, - 
IFF_UP | IFF_LOOPBACK); + dev = __dev_get_by_flags(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { err = -ENODEV; @@ -126,17 +122,15 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } - err = ipv6_dev_ac_inc(dev, addr); + err = __ipv6_dev_ac_inc(idev, addr); if (!err) { - spin_lock_bh(&ipv6_sk_ac_lock); pac->acl_next = np->ipv6_ac_list; np->ipv6_ac_list = pac; - spin_unlock_bh(&ipv6_sk_ac_lock); pac = NULL; } error: - rcu_read_unlock(); + rtnl_unlock(); if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; @@ -152,7 +146,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); - spin_lock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -161,7 +155,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) prev_pac = pac; } if (!pac) { - spin_unlock_bh(&ipv6_sk_ac_lock); + rtnl_unlock(); return -ENOENT; } if (prev_pac) @@ -169,13 +163,10 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - spin_unlock_bh(&ipv6_sk_ac_lock); - - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, pac, sizeof(*pac)); return 0; @@ -192,18 +183,16 @@ void ipv6_sock_ac_close(struct sock *sk) if (!np->ipv6_ac_list) return; - spin_lock_bh(&ipv6_sk_ac_lock); + rtnl_lock(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; - rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if 
(pac->acl_ifindex != prev_index) { - dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + dev = __dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -211,7 +200,12 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - rcu_read_unlock(); + rtnl_unlock(); +} + +static void aca_get(struct ifacaddr6 *aca) +{ + atomic_inc(&aca->aca_refcnt); } static void aca_put(struct ifacaddr6 *ac) @@ -223,20 +217,39 @@ static void aca_put(struct ifacaddr6 *ac) } } +static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, + const struct in6_addr *addr) +{ + struct inet6_dev *idev = rt->rt6i_idev; + struct ifacaddr6 *aca; + + aca = kzalloc(sizeof(*aca), GFP_ATOMIC); + if (aca == NULL) + return NULL; + + aca->aca_addr = *addr; + in6_dev_hold(idev); + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 1); + spin_lock_init(&aca->aca_lock); + + return aca; +} + /* * device anycast group inc (add if not found) */ -int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) +int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct inet6_dev *idev; struct rt6_info *rt; int err; - idev = in6_dev_get(dev); - - if (idev == NULL) - return -EINVAL; + ASSERT_RTNL(); write_lock_bh(&idev->lock); if (idev->dead) { @@ -252,46 +265,35 @@ int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) } } - /* - * not found: create a new one. 
- */ - - aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); - - if (aca == NULL) { - err = -ENOMEM; - goto out; - } - rt = addrconf_dst_alloc(idev, addr, true); if (IS_ERR(rt)) { - kfree(aca); err = PTR_ERR(rt); goto out; } - - aca->aca_addr = *addr; - aca->aca_idev = idev; - aca->aca_rt = rt; - aca->aca_users = 1; - /* aca_tstamp should be updated upon changes */ - aca->aca_cstamp = aca->aca_tstamp = jiffies; - atomic_set(&aca->aca_refcnt, 2); - spin_lock_init(&aca->aca_lock); + aca = aca_alloc(rt, addr); + if (aca == NULL) { + ip6_rt_put(rt); + err = -ENOMEM; + goto out; + } aca->aca_next = idev->ac_list; idev->ac_list = aca; + + /* Hold this for addrconf_join_solict() below before we unlock, + * it is already exposed via idev->ac_list. + */ + aca_get(aca); write_unlock_bh(&idev->lock); ip6_ins_rt(rt); - addrconf_join_solict(dev, &aca->aca_addr); + addrconf_join_solict(idev->dev, &aca->aca_addr); aca_put(aca); return 0; out: write_unlock_bh(&idev->lock); - in6_dev_put(idev); return err; } @@ -302,6 +304,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); prev_aca = NULL; for (aca = idev->ac_list; aca; aca = aca->aca_next) { @@ -331,7 +335,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rcu_read_lock() */ +/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev = __in6_dev_get(dev); @@ -341,6 +345,27 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) return __ipv6_dev_ac_dec(idev, addr); } +void ipv6_ac_destroy_dev(struct inet6_dev *idev) +{ + struct ifacaddr6 *aca; + + write_lock_bh(&idev->lock); + while ((aca = idev->ac_list) != NULL) { + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->dst); + 
ip6_del_rt(aca->aca_rt); + + aca_put(aca); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + /* * check if the interface has this anycast address * called with rcu_read_lock() diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c3bf2d2e519e..2cdc38338be3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -43,13 +43,13 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p, final; struct dst_entry *dst; struct flowi6 fl6; struct ip6_flowlabel *flowlabel = NULL; - struct ipv6_txoptions *opt; + struct ipv6_txoptions *opt; int addr_type; int err; @@ -199,6 +199,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; + ip6_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; @@ -331,7 +332,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; - struct sk_buff *skb, *skb2; + struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; @@ -341,7 +342,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int copied; err = -EAGAIN; - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (skb == NULL) goto out; @@ -414,17 +415,6 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; - /* Reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { - sk->sk_err = 
SKB_EXT_ERR(skb2)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - out_free_skb: kfree_skb(skb); out: diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index d15da1377149..83fc3a385a26 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -17,10 +17,10 @@ * Authors * * Mitsuru KANDA @USAGI : IPv6 Support - * Kazunori MIYAZAWA @USAGI : - * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * - * This file is derived from net/ipv4/esp.c + * This file is derived from net/ipv4/esp.c */ #define pr_fmt(fmt) "IPv6: " fmt @@ -598,7 +598,7 @@ static int esp6_init_state(struct xfrm_state *x) case XFRM_MODE_BEET: if (x->sel.family != AF_INET6) x->props.header_len += IPV4_BEET_PHMAXLEN + - (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); break; case XFRM_MODE_TRANSPORT: break; @@ -621,11 +621,10 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type esp6_type = -{ +static const struct xfrm_type esp6_type = { .description = "ESP6", - .owner = THIS_MODULE, - .proto = IPPROTO_ESP, + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 8d67900aa003..bfde361b6134 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -142,7 +142,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) default: /* Other TLV code so scan list */ if (optlen > len) goto bad; - for (curr=procs; curr->type >= 0; curr++) { + for (curr = procs; curr->type >= 0; curr++) { if (curr->type == nh[off]) { /* type specific length/alignment checks will be performed in the diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f6c84a6eb238..97ae70077a4f 100644 --- a/net/ipv6/icmp.c +++ 
b/net/ipv6/icmp.c @@ -170,11 +170,11 @@ static bool is_ineligible(const struct sk_buff *skb) /* * Check the ICMP output rate limit */ -static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) +static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { - struct dst_entry *dst; struct net *net = sock_net(sk); + struct dst_entry *dst; bool res = false; /* Informational messages are not limited. */ @@ -199,16 +199,20 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; - struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v6(net->ipv6.peers, + &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); + } } dst_release(dst); return res; @@ -503,7 +507,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) msg.type = type; len = skb->len - msg.offset; - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); if (len < 0) { LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); goto out_dst_release; @@ -626,24 +630,25 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) int inner_offset; __be16 frag_off; u8 nexthdr; + struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return; + goto out; nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct 
ipv6hdr), &nexthdr, &frag_off); - if (inner_offset<0) - return; + if (inner_offset < 0) + goto out; } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ if (!pskb_may_pull(skb, inner_offset+8)) - return; + goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed @@ -652,13 +657,15 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) --ANK (980726) */ - rcu_read_lock(); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); - rcu_read_unlock(); raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); + return; + +out: + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); } /* @@ -770,12 +777,12 @@ static int icmpv6_rcv(struct sk_buff *skb) break; default: - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); - /* informational */ if (type & ICMPV6_INFOMSG_MASK) break; + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); + /* * error of unknown type. 
* must pass to upper level @@ -805,7 +812,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; fl6->daddr = *daddr; - fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->flowi6_proto = IPPROTO_ICMPV6; fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; @@ -872,8 +879,8 @@ static void __net_exit icmpv6_sk_exit(struct net *net) } static struct pernet_operations icmpv6_sk_ops = { - .init = icmpv6_sk_init, - .exit = icmpv6_sk_exit, + .init = icmpv6_sk_init, + .exit = icmpv6_sk_exit, }; int __init icmpv6_init(void) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index a245e5ddffbd..29b32206e494 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -63,7 +63,6 @@ int inet6_csk_bind_conflict(const struct sock *sk, return sk2 != NULL; } - EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); struct dst_entry *inet6_csk_route_req(struct sock *sk, @@ -144,7 +143,6 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, return NULL; } - EXPORT_SYMBOL_GPL(inet6_csk_search_req); void inet6_csk_reqsk_queue_hash_add(struct sock *sk, @@ -160,10 +158,9 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); inet_csk_reqsk_queue_added(sk, timeout); } - EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; @@ -175,7 +172,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, sk->sk_bound_dev_if); } - EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 262e13c02ec2..051dffb49c90 100644 --- a/net/ipv6/inet6_hashtables.c 
+++ b/net/ipv6/inet6_hashtables.c @@ -6,7 +6,7 @@ * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here - * by Arnaldo Carvalho de Melo <acme@mandriva.com> + * by Arnaldo Carvalho de Melo <acme@mandriva.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -198,7 +198,7 @@ begin: } } else if (score == hiscore && reuseport) { matches++; - if (((u64)phash * matches) >> 32 == 0) + if (reciprocal_scale(phash, matches) == 0) result = sk; phash = next_pseudo_random32(phash); } @@ -222,7 +222,6 @@ begin: rcu_read_unlock(); return result; } - EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -238,7 +237,6 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, return sk; } - EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, @@ -324,5 +322,4 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), __inet6_check_established, __inet6_hash); } - EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cb4459bd1d29..b2d1838897c9 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -46,20 +46,11 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; -enum fib_walk_state_t { -#ifdef CONFIG_IPV6_SUBTREES - FWS_S, -#endif - FWS_L, - FWS_R, - FWS_C, - FWS_U -}; - -struct fib6_cleaner_t { - struct fib6_walker_t w; +struct fib6_cleaner { + struct fib6_walker w; struct net *net; int (*func)(struct rt6_info *, void *arg); + int sernum; void *arg; }; @@ -74,8 +65,8 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct 
fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker_t *w); -static int fib6_walk_continue(struct fib6_walker_t *w); +static int fib6_walk(struct fib6_walker *w); +static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the @@ -84,34 +75,41 @@ static int fib6_walk_continue(struct fib6_walker_t *w); * result of redirects, path MTU changes, etc. */ -static __u32 rt_sernum; - static void fib6_gc_timer_cb(unsigned long arg); static LIST_HEAD(fib6_walkers); #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) -static inline void fib6_walker_link(struct fib6_walker_t *w) +static void fib6_walker_link(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_add(&w->lh, &fib6_walkers); write_unlock_bh(&fib6_walker_lock); } -static inline void fib6_walker_unlink(struct fib6_walker_t *w) +static void fib6_walker_unlink(struct fib6_walker *w) { write_lock_bh(&fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } -static __inline__ u32 fib6_new_sernum(void) + +static int fib6_new_sernum(struct net *net) { - u32 n = ++rt_sernum; - if ((__s32)n <= 0) - rt_sernum = n = 1; - return n; + int new, old; + + do { + old = atomic_read(&net->ipv6.fib6_sernum); + new = old < INT_MAX ? old + 1 : 1; + } while (atomic_cmpxchg(&net->ipv6.fib6_sernum, + old, new) != old); + return new; } +enum { + FIB6_NO_SERNUM_CHANGE = 0, +}; + /* * Auxiliary address test functions for the radix tree. 
* @@ -128,7 +126,7 @@ static __inline__ u32 fib6_new_sernum(void) # define BITOP_BE32_SWIZZLE 0 #endif -static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* @@ -142,7 +140,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static __inline__ struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -151,12 +149,12 @@ static __inline__ struct fib6_node *node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node *fn) +static void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } -static __inline__ void rt6_release(struct rt6_info *rt) +static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) dst_free(&rt->dst); @@ -267,7 +265,7 @@ static void __net_init fib6_tables_init(struct net *net) #endif -static int fib6_dump_node(struct fib6_walker_t *w) +static int fib6_dump_node(struct fib6_walker *w) { int res; struct rt6_info *rt; @@ -287,7 +285,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void *)cb->args[2]; + struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { @@ -310,7 +308,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { - struct fib6_walker_t *w; + struct fib6_walker *w; int res; w = (void *)cb->args[2]; @@ -355,7 +353,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; - struct fib6_walker_t *w; + struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; @@ -423,14 +421,13 @@ out: static struct fib6_node *fib6_add_1(struct fib6_node *root, 
struct in6_addr *addr, int plen, int offset, int allow_create, - int replace_required) + int replace_required, int sernum) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; - __u32 sernum = fib6_new_sernum(); RT6_TRACE("fib6_add_1\n"); @@ -627,7 +624,7 @@ insert_above: return ln; } -static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +static bool rt6_qualify_for_ecmp(struct rt6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -643,7 +640,7 @@ static int fib6_commit_metrics(struct dst_entry *dst, if (dst->flags & DST_HOST) { mp = dst_metrics_write_ptr(dst); } else { - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); if (!mp) return -ENOMEM; dst_init_metrics(dst, mp, 0); @@ -820,7 +817,7 @@ add: return 0; } -static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) @@ -848,6 +845,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; + int sernum = fib6_new_sernum(info->nl_net); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -860,7 +858,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required); + replace_required, sernum); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -894,14 +892,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = fib6_new_sernum(); + sfn->fn_sernum = 
sernum; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + allow_create, replace_required, sernum); if (IS_ERR(sn)) { /* If it is failed, discard just allocated @@ -920,7 +918,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + allow_create, replace_required, sernum); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1174,7 +1172,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, int children; int nstate; struct fib6_node *child, *pn; - struct fib6_walker_t *w; + struct fib6_walker *w; int iter = 0; for (;;) { @@ -1276,7 +1274,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, struct nl_info *info) { - struct fib6_walker_t *w; + struct fib6_walker *w; struct rt6_info *rt = *rtp; struct net *net = info->nl_net; @@ -1414,7 +1412,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * <0 -> walk is terminated by an error. 
*/ -static int fib6_walk_continue(struct fib6_walker_t *w) +static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn; @@ -1498,7 +1496,7 @@ skip: } } -static int fib6_walk(struct fib6_walker_t *w) +static int fib6_walk(struct fib6_walker *w) { int res; @@ -1512,15 +1510,25 @@ static int fib6_walk(struct fib6_walker_t *w) return res; } -static int fib6_clean_node(struct fib6_walker_t *w) +static int fib6_clean_node(struct fib6_walker *w) { int res; struct rt6_info *rt; - struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, }; + if (c->sernum != FIB6_NO_SERNUM_CHANGE && + w->node->fn_sernum != c->sernum) + w->node->fn_sernum = c->sernum; + + if (!c->func) { + WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); + w->leaf = NULL; + return 0; + } + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = c->func(rt, c->arg); if (res < 0) { @@ -1554,9 +1562,9 @@ static int fib6_clean_node(struct fib6_walker_t *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + bool prune, int sernum, void *arg) { - struct fib6_cleaner_t c; + struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; @@ -1564,14 +1572,16 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.count = 0; c.w.skip = 0; c.func = func; + c.sernum = sernum; c.arg = arg; c.net = net; fib6_walk(&c.w); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), - void *arg) +static void __fib6_clean_all(struct net *net, + int (*func)(struct rt6_info *, void *), + int sernum, void *arg) { struct fib6_table *table; struct hlist_head *head; @@ -1583,13 +1593,19 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); 
fib6_clean_tree(net, &table->tb6_root, - func, 0, arg); + func, false, sernum, arg); write_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } +void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), + void *arg) +{ + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); +} + static int fib6_prune_clone(struct rt6_info *rt, void *arg) { if (rt->rt6i_flags & RTF_CACHE) { @@ -1602,7 +1618,15 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); + fib6_clean_tree(net, fn, fib6_prune_clone, true, + FIB6_NO_SERNUM_CHANGE, NULL); +} + +static void fib6_flush_trees(struct net *net) +{ + int new_sernum = fib6_new_sernum(net); + + __fib6_clean_all(net, NULL, new_sernum, NULL); } /* @@ -1788,6 +1812,8 @@ int __init fib6_init(void) NULL); if (ret) goto out_unregister_subsys; + + __fib6_flush_trees = fib6_flush_trees; out: return ret; @@ -1808,10 +1834,10 @@ void fib6_gc_cleanup(void) struct ipv6_route_iter { struct seq_net_private p; - struct fib6_walker_t w; + struct fib6_walker w; loff_t skip; struct fib6_table *tbl; - __u32 sernum; + int sernum; }; static int ipv6_route_seq_show(struct seq_file *seq, void *v) @@ -1839,7 +1865,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) return 0; } -static int ipv6_route_yield(struct fib6_walker_t *w) +static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; @@ -1960,7 +1986,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { - struct fib6_walker_t *w = &iter->w; + struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 4052694c6f2c..3dd7d4ebd7cd 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -136,7 
+136,7 @@ static void ip6_fl_gc(unsigned long dummy) spin_lock(&ip6_fl_lock); - for (i=0; i<=FL_HASH_MASK; i++) { + for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; @@ -239,7 +239,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -259,7 +259,6 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } - EXPORT_SYMBOL_GPL(fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) @@ -293,11 +292,11 @@ void fl6_free_socklist(struct sock *sk) following rthdr. */ -struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, - struct ip6_flowlabel * fl, - struct ipv6_txoptions * fopt) +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, + struct ip6_flowlabel *fl, + struct ipv6_txoptions *fopt) { - struct ipv6_txoptions * fl_opt = fl->opt; + struct ipv6_txoptions *fl_opt = fl->opt; if (fopt == NULL || fopt->opt_flen == 0) return fl_opt; @@ -388,7 +387,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, goto done; msg.msg_controllen = olen; - msg.msg_control = (void*)(fl->opt+1); + msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, @@ -517,7 +516,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; - struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl1 = NULL; struct ipv6_fl_socklist *sfl; struct ipv6_fl_socklist __rcu **sflp; struct ip6_flowlabel *fl, *fl1 = NULL; @@ -542,7 +541,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user 
*optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp))!=NULL; + (sfl = rcu_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3873181ed856..12c3c8ef3849 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -314,6 +314,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); + if (t && create) + return NULL; if (t || !create) return t; @@ -322,7 +324,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, else strcpy(name, "ip6gre%d"); - dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!dev) return NULL; @@ -615,6 +618,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, int err = -1; u8 proto; struct sk_buff *new_skb; + __be16 protocol; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -723,15 +727,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, * Push down and install the IP header. */ ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; - ((__be16 *)(ipv6h + 1))[1] = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : skb->protocol; + protocol = (dev->type == ARPHRD_ETHER) ? 
+ htons(ETH_P_TEB) : skb->protocol; + ((__be16 *)(ipv6h + 1))[1] = protocol; if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); @@ -752,6 +758,8 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } } + skb_set_inner_protocol(skb, protocol); + ip6tunnel_xmit(skb, dev); if (ndst) ip6_tnl_dst_store(tunnel, ndst); @@ -778,7 +786,7 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv4_get_dsfield(iph); @@ -828,7 +836,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -1174,7 +1182,9 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); __be16 *p = (__be16 *)(ipv6h+1); - ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ip6_flow_hdr(ipv6h, 0, + ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; @@ -1232,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->iflink = 0; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static int ip6gre_tunnel_init(struct net_device *dev) @@ -1323,7 +1333,8 @@ static int __net_init ip6gre_init_net(struct net *net) int err; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", - ip6gre_tunnel_setup); + NET_NAME_UNKNOWN, + ip6gre_tunnel_setup); if (!ign->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; 
@@ -1719,4 +1730,5 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); +MODULE_ALIAS_RTNL_LINK("ip6gretap"); MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 4578e23834f7..14dacc544c3e 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -13,7 +13,7 @@ static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? - 0 : -EBUSY; + 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 51d54dc376f3..a3084ab5df6c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -15,8 +15,8 @@ */ /* Changes * - * Mitsuru KANDA @USAGI and - * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). 
*/ #include <linux/errno.h> @@ -65,7 +65,7 @@ int ip6_rcv_finish(struct sk_buff *skb) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ipv6hdr *hdr; - u32 pkt_len; + u32 pkt_len; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 65eda2a8af48..9034f76ae013 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -53,31 +53,6 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) return proto; } -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - const struct net_offload *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - ops = rcu_dereference(inet6_offloads[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->callbacks.gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->callbacks.gso_send_check(skb); - } - -out: - return err; -} - static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -244,7 +219,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, continue; iph2 = (struct ipv6hdr *)(p->data + off); - first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; + first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled @@ -261,6 +236,9 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, /* flush if Traffic Class fields are different */ NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; + + /* Clear flush_id, there's really no concept of ID in IPv6. 
*/ + NAPI_GRO_CB(p)->flush_id = 0; } NAPI_GRO_CB(skb)->flush |= flush; @@ -303,7 +281,6 @@ out_unlock: static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, @@ -312,8 +289,9 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { - .gso_send_check = ipv6_gso_send_check, .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, }, }; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 45702b8cd141..8e950c250ada 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -20,7 +20,7 @@ * etc. * * H. von Brand : Added missing #include <linux/string.h> - * Imran Patel : frag id should be in NBO + * Imran Patel : frag id should be in NBO * Kazunori MIYAZAWA @USAGI * : add ip6_append_data and related functions * for datagram xmit @@ -205,7 +205,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - ip6_flow_hdr(hdr, tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -232,7 +233,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EMSGSIZE; } - EXPORT_SYMBOL(ip6_xmit); static int ip6_call_ra_chain(struct sk_buff *skb, int sel) @@ -554,14 +554,14 @@ static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk ? 
inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; int hroom, troom; __be32 frag_id = 0; - int ptr, offset = 0, err=0; + int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; struct net *net = dev_net(skb_dst(skb)->dev); @@ -636,7 +636,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } __skb_pull(skb, hlen); - fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); __skb_push(skb, hlen); skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); @@ -661,7 +661,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (frag) { frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); - fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr)); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), tmp_hdr, @@ -680,7 +680,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } err = output(skb); - if(!err) + if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); @@ -701,11 +701,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) return 0; } - while (frag) { - skb = frag->next; - kfree_skb(frag); - frag = skb; - } + kfree_skb_list(frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -741,7 +737,7 @@ slow_path: /* * Keep copying data until we run out. */ - while(left > 0) { + while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) @@ -802,8 +798,8 @@ slow_path: /* * Copy a block of the IP datagram. 
*/ - if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) - BUG(); + BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), + len)); left -= len; fh->frag_off = htons(offset); @@ -864,7 +860,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, - * and MSG_DONTROUTE --ANK (980726) + * and MSG_DONTROUTE --ANK (980726) * * 1. ip6_rt_check(): If route was host route, * check that cached destination is current. @@ -1008,7 +1004,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); @@ -1040,7 +1036,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, if (final_dst) fl6->daddr = *final_dst; - return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); + return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); @@ -1048,7 +1044,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags, + int transhdrlen, int mtu, unsigned int flags, struct rt6_info *rt) { @@ -1071,7 +1067,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ skb_reset_network_header(skb); @@ -1156,6 +1152,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int err; int offset = 0; __u8 
tx_flags = 0; + u32 tskey = 0; if (flags&MSG_PROBE) return 0; @@ -1271,9 +1268,12 @@ emsgsize: } } - /* For UDP, check if TX timestamp is enabled */ - if (sk->sk_type == SOCK_DGRAM) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); + if (tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + } /* * Let's try using as much space as possible. @@ -1381,12 +1381,6 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; - else { - /* Only the initial fragment - * is time stamped. - */ - tx_flags = 0; - } } if (skb == NULL) goto error; @@ -1400,8 +1394,11 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - if (sk->sk_type == SOCK_DGRAM) - skb_shinfo(skb)->tx_flags = tx_flags; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = tx_flags; + tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes @@ -1571,7 +1568,9 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); + ip6_flow_hdr(hdr, np->cork.tclass, + ip6_make_flowlabel(net, skb, fl6->flowlabel, + np->autoflowlabel)); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index afa082458360..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -315,7 +315,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) else sprintf(name, "ip6tnl%%d"); - dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ip6_tnl_dev_setup); if (dev == NULL) goto failed; @@ -363,8 +364,12 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if 
(ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; @@ -407,12 +412,12 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; __u8 nexthdr = ipv6h->nexthdr; - __u16 off = sizeof (*ipv6h); + __u16 off = sizeof(*ipv6h); while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { __u16 optlen = 0; struct ipv6_opt_hdr *hdr; - if (raw + off + sizeof (*hdr) > skb->data && + if (raw + off + sizeof(*hdr) > skb->data && !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) break; @@ -529,7 +534,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; - if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { + if ((len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; @@ -990,7 +995,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - sizeof (*ipv6h); + mtu = dst_mtu(dst) - sizeof(*ipv6h); if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1046,7 +1051,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), + ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1081,7 +1087,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = 
IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1133,7 +1139,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); @@ -1227,11 +1233,11 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) if (rt->dst.dev) { dev->hard_header_len = rt->dst.dev->hard_header_len + - sizeof (struct ipv6hdr); + sizeof(struct ipv6hdr); - dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr); + dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu-=8; + dev->mtu -= 8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1348,7 +1354,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } @@ -1360,7 +1366,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { err = -EFAULT; } break; @@ -1370,7 +1376,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && @@ -1405,7 +1411,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + 
if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); @@ -1480,14 +1486,14 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); - dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr); t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu-=8; + dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); @@ -1772,7 +1778,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", - ip6_tnl_dev_setup); + NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c new file mode 100644 index 000000000000..b04ed72c4542 --- /dev/null +++ b/net/ipv6/ip6_udp_tunnel.c @@ -0,0 +1,107 @@ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/udp.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/in6.h> +#include <net/udp.h> +#include <net/udp_tunnel.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> + +int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + struct sockaddr_in6 udp6_addr; + int err; + struct socket *sock = NULL; + + err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + udp6_addr.sin6_family = 
AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp6_addr.sin6_family = AF_INET6; + memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, + sizeof(udp6_addr.sin6_addr)); + udp6_addr.sin6_port = cfg->peer_udp_port; + err = kernel_connect(sock, + (struct sockaddr *)&udp6_addr, + sizeof(udp6_addr), 0); + } + if (err < 0) + goto error; + + udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); + udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); + + *sockp = sock; + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL_GPL(udp_sock_create6); + +int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, + struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, __be16 dst_port) +{ + struct udphdr *uh; + struct ipv6hdr *ip6h; + struct sock *sk = sock->sk; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, + &sk->sk_v6_daddr, skb->len); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + ip6tunnel_xmit(skb, dev); + return 0; +} 
+EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 9aaa6bb229e4..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -204,7 +204,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p else sprintf(name, "ip6_vti%%d"); - dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (dev == NULL) goto failed; @@ -253,8 +253,12 @@ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) + ipv6_addr_equal(remote, &t->parms.raddr)) { + if (create) + return NULL; + return t; + } } if (!create) return NULL; @@ -803,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } /** @@ -1020,7 +1024,7 @@ static int __net_init vti6_init_net(struct net *net) err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", - vti6_dev_setup); + NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; @@ -1089,36 +1093,26 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { **/ static int __init vti6_tunnel_init(void) { - int err; + const char *msg; + int err; + msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) - goto out_pernet; + goto pernet_dev_failed; + msg = "tunnel protocols"; err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); - if (err < 0) { - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } - + if (err < 0) + goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); - if (err < 0) { - 
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } - + if (err < 0) + goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); - if (err < 0) { - xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); - xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); - pr_err("%s: can't register vti6 protocol\n", __func__); - - goto out; - } + if (err < 0) + goto xfrm_proto_comp_failed; + msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1127,11 +1121,14 @@ static int __init vti6_tunnel_init(void) rtnl_link_failed: xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); +xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); -out: +xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); -out_pernet: +pernet_dev_failed: + pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1141,13 +1138,9 @@ out_pernet: static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); - if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP)) - pr_info("%s: can't deregister protocol\n", __func__); - if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH)) - pr_info("%s: can't deregister protocol\n", __func__); - if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP)) - pr_info("%s: can't deregister protocol\n", __func__); - + xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8250474ab7dc..0171f08325c3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -744,7 +744,7 @@ static struct 
net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) else sprintf(name, "pim6reg%u", mrt->id); - dev = alloc_netdev(0, name, reg_vif_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (dev == NULL) return NULL; @@ -845,7 +845,7 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; @@ -1103,7 +1103,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, * Play the pending entries through our router */ - while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index d1c793cffcb5..1b9316e1386a 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -181,8 +181,7 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) return 0; } -static const struct xfrm_type ipcomp6_type = -{ +static const struct xfrm_type ipcomp6_type = { .description = "IPCOMP6", .owner = THIS_MODULE, .proto = IPPROTO_COMP, @@ -193,8 +192,7 @@ static const struct xfrm_type ipcomp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static struct xfrm6_protocol ipcomp6_protocol = -{ +static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index edb58aff4ae7..e1a9583bb419 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -66,12 +66,12 @@ int ip6_ra_control(struct sock *sk, int sel) if (sk->sk_type != 
SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; - new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; write_lock_bh(&ip6_ra_lock); - for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { - if (sel>=0) { + if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; @@ -130,7 +130,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, int retv = -ENOPROTOOPT; if (optval == NULL) - val=0; + val = 0; else { if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) @@ -139,7 +139,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, val = 0; } - valbool = (val!=0); + valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); @@ -235,7 +235,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; - np->ipv6only = valbool; + sk->sk_ipv6only = valbool; retv = 0; break; @@ -474,7 +474,7 @@ sticky_done: goto done; msg.msg_controllen = optlen; - msg.msg_control = (void*)(opt+1); + msg.msg_control = (void *)(opt+1); retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, &junk, &junk); @@ -687,7 +687,7 @@ done: retv = -ENOBUFS; break; } - gsf = kmalloc(optlen,GFP_KERNEL); + gsf = kmalloc(optlen, GFP_KERNEL); if (!gsf) { retv = -ENOBUFS; break; @@ -834,6 +834,10 @@ pref_skip_coa: np->dontfrag = valbool; retv = 0; break; + case IPV6_AUTOFLOWLABEL: + np->autoflowlabel = valbool; + retv = 0; + break; } release_sock(sk); @@ -869,7 +873,6 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(ipv6_setsockopt); #ifdef CONFIG_COMPAT @@ -905,7 +908,6 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, #endif 
return err; } - EXPORT_SYMBOL(compat_ipv6_setsockopt); #endif @@ -917,7 +919,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, if (!opt) return 0; - switch(optname) { + switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; @@ -1058,7 +1060,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_V6ONLY: - val = np->ipv6only; + val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: @@ -1158,7 +1160,6 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; - break; } case IPV6_TRANSPARENT: @@ -1273,13 +1274,17 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->dontfrag; break; + case IPV6_AUTOFLOWLABEL: + val = np->autoflowlabel; + break; + default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); - if(put_user(len, optlen)) + if (put_user(len, optlen)) return -EFAULT; - if(copy_to_user(optval,&val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } @@ -1292,7 +1297,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); - if(level != SOL_IPV6) + if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); @@ -1314,7 +1319,6 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(ipv6_getsockopt); #ifdef CONFIG_COMPAT @@ -1357,7 +1361,6 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, #endif return err; } - EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 617f0958e164..9648de2b6745 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -64,15 +64,6 @@ #include <net/ip6_checksum.h> -/* Set to 3 to get tracing... 
*/ -#define MCAST_DEBUG 2 - -#if MCAST_DEBUG >= 3 -#define MDBG(x) printk x -#else -#define MDBG(x) -#endif - /* Ensure that we have struct in6_addr aligned on 32bit word. */ static void *__mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), @@ -82,9 +73,6 @@ static void *__mld2_query_bugs[] __attribute__((__unused__)) = { static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; -/* Big mc list lock for all the sockets */ -static DEFINE_SPINLOCK(ipv6_sk_mc_lock); - static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); static void igmp6_timer_handler(unsigned long data); @@ -121,6 +109,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, #define IPV6_MLD_MAX_MSF 64 int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; +int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group @@ -172,7 +161,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) mc_lst->next = NULL; mc_lst->addr = *addr; - rcu_read_lock(); + rtnl_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); @@ -181,10 +170,10 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) ip6_rt_put(rt); } } else - dev = dev_get_by_index_rcu(net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) { - rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -201,17 +190,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { - rcu_read_unlock(); + rtnl_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } - spin_lock(&ipv6_sk_mc_lock); mc_lst->next = np->ipv6_mc_list; rcu_assign_pointer(np->ipv6_mc_list, mc_lst); - spin_unlock(&ipv6_sk_mc_lock); - rcu_read_unlock(); + rtnl_unlock(); return 0; } @@ -229,20 
+216,17 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - spin_lock(&ipv6_sk_mc_lock); + rtnl_lock(); for (lnk = &np->ipv6_mc_list; - (mc_lst = rcu_dereference_protected(*lnk, - lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + (mc_lst = rtnl_dereference(*lnk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; - spin_unlock(&ipv6_sk_mc_lock); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev != NULL) { struct inet6_dev *idev = __in6_dev_get(dev); @@ -251,13 +235,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - rcu_read_unlock(); + rtnl_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); return 0; } } - spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); return -EADDRNOTAVAIL; } @@ -302,16 +287,13 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; - spin_lock(&ipv6_sk_mc_lock); - while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, - lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { + rtnl_lock(); + while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; - spin_unlock(&ipv6_sk_mc_lock); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + dev = __dev_get_by_index(net, mc_lst->ifindex); if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); @@ -320,14 +302,12 @@ void ipv6_sock_mc_close(struct sock *sk) __ipv6_dev_mc_dec(idev, &mc_lst->addr); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - rcu_read_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); - 
spin_lock(&ipv6_sk_mc_lock); } - spin_unlock(&ipv6_sk_mc_lock); + rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, @@ -390,7 +370,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; @@ -407,7 +387,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface filter */ ip6_mc_del_src(idev, group, omode, 1, source, 1); - for (j=i+1; j<psl->sl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -433,19 +413,19 @@ int ip6_mc_source(int add, int omode, struct sock *sk, newpsl->sl_max = count; newpsl->sl_count = count - IP6_SFBLOCK; if (psl) { - for (i=0; i<psl->sl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); } pmc->sflist = psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) /* There is an error in the address. */ goto done; } - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; psl->sl_count++; @@ -514,7 +494,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; - for (i=0; i<newpsl->sl_count; ++i) { + for (i = 0; i < newpsl->sl_count; ++i) { struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; @@ -576,9 +556,8 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, } err = -EADDRNOTAVAIL; - /* - * changes to the ipv6_mc_list require the socket lock and - * a read lock on ip6_sk_mc_lock. 
We have the socket lock, + /* changes to the ipv6_mc_list require the socket lock and + * rtnl lock. We have the socket lock and rcu read lock, * so reading the list is safe. */ @@ -602,11 +581,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - /* changes to psl require the socket lock, a read lock on - * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We - * have the socket lock, so reading here is safe. + /* changes to psl require the socket lock, and a write lock + * on pmc->sflock. We have the socket lock so reading here is safe. */ - for (i=0; i<copycount; i++) { + for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -648,7 +626,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, } else { int i; - for (i=0; i<psl->sl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) break; } @@ -663,14 +641,6 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } -static void ma_put(struct ifmcaddr6 *mc) -{ - if (atomic_dec_and_test(&mc->mca_refcnt)) { - in6_dev_put(mc->idev); - kfree(mc); - } -} - static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; @@ -762,7 +732,7 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) pmc->mca_tomb = im->mca_tomb; pmc->mca_sources = im->mca_sources; im->mca_tomb = im->mca_sources = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->mca_crcount; } spin_unlock_bh(&im->mca_lock); @@ -780,7 +750,7 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; - for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) 
break; pmc_prev = pmc; @@ -794,7 +764,7 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) spin_unlock_bh(&idev->mc_lock); if (pmc) { - for (psf=pmc->mca_tomb; psf; psf=psf_next) { + for (psf = pmc->mca_tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -821,14 +791,14 @@ static void mld_clear_delrec(struct inet6_dev *idev) /* clear dead sources, too */ read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { struct ip6_sf_list *psf, *psf_next; spin_lock_bh(&pmc->mca_lock); psf = pmc->mca_tomb; pmc->mca_tomb = NULL; spin_unlock_bh(&pmc->mca_lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -836,6 +806,48 @@ static void mld_clear_delrec(struct inet6_dev *idev) read_unlock_bh(&idev->lock); } +static void mca_get(struct ifmcaddr6 *mc) +{ + atomic_inc(&mc->mca_refcnt); +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, + const struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + + mc = kzalloc(sizeof(*mc), GFP_ATOMIC); + if (mc == NULL) + return NULL; + + setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); + + mc->mca_addr = *addr; + mc->idev = idev; /* reference taken by caller */ + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 1); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + return mc; +} /* * device multicast group inc (add if not found) @@ -845,6 +857,8 @@ int 
ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) struct ifmcaddr6 *mc; struct inet6_dev *idev; + ASSERT_RTNL(); + /* we need to take a reference on idev */ idev = in6_dev_get(dev); @@ -869,38 +883,20 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) } } - /* - * not found: create a new one. - */ - - mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); - - if (mc == NULL) { + mc = mca_alloc(idev, addr); + if (!mc) { write_unlock_bh(&idev->lock); in6_dev_put(idev); return -ENOMEM; } - setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); - - mc->mca_addr = *addr; - mc->idev = idev; /* (reference taken) */ - mc->mca_users = 1; - /* mca_stamp should be updated upon changes */ - mc->mca_cstamp = mc->mca_tstamp = jiffies; - atomic_set(&mc->mca_refcnt, 2); - spin_lock_init(&mc->mca_lock); - - /* initial mode is (EX, empty) */ - mc->mca_sfmode = MCAST_EXCLUDE; - mc->mca_sfcount[MCAST_EXCLUDE] = 1; - - if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || - IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) - mc->mca_flags |= MAF_NOREPORT; - mc->next = idev->mc_list; idev->mc_list = mc; + + /* Hold this for the code below before we unlock, + * it is already exposed via idev->mc_list. 
+ */ + mca_get(mc); write_unlock_bh(&idev->lock); mld_del_delrec(idev, &mc->mca_addr); @@ -916,8 +912,10 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, **map; + ASSERT_RTNL(); + write_lock_bh(&idev->lock); - for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; @@ -942,7 +940,7 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) struct inet6_dev *idev; int err; - rcu_read_lock(); + ASSERT_RTNL(); idev = __in6_dev_get(dev); if (!idev) @@ -950,7 +948,6 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) else err = __ipv6_dev_mc_dec(idev, addr); - rcu_read_unlock(); return err; } @@ -968,7 +965,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); - for (mc = idev->mc_list; mc; mc=mc->next) { + for (mc = idev->mc_list; mc; mc = mc->next) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } @@ -977,7 +974,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, struct ip6_sf_list *psf; spin_lock_bh(&mc->mca_lock); - for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + for (psf = mc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } @@ -986,7 +983,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, psf->sf_count[MCAST_EXCLUDE] != mc->mca_sfcount[MCAST_EXCLUDE]; else - rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&mc->mca_lock); } else rv = true; /* don't filter unspecified source */ @@ -1077,10 +1074,10 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for 
(psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) { + for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != @@ -1110,10 +1107,10 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; i<nsrcs; i++) { + for (i = 0; i < nsrcs; i++) { if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { psf->sf_gsresp = 1; scount++; @@ -1191,15 +1188,16 @@ static void mld_update_qrv(struct inet6_dev *idev, * and SHOULD NOT be one. Catch this here if we ever run * into such a case in future. */ + const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv); WARN_ON(idev->mc_qrv == 0); if (mlh2->mld2q_qrv > 0) idev->mc_qrv = mlh2->mld2q_qrv; - if (unlikely(idev->mc_qrv < 2)) { + if (unlikely(idev->mc_qrv < min_qrv)) { net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", - idev->mc_qrv, MLD_QRV_DEFAULT); - idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_qrv, min_qrv); + idev->mc_qrv = min_qrv; } } @@ -1239,7 +1237,7 @@ static void mld_update_qri(struct inet6_dev *idev, } static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, - unsigned long *max_delay) + unsigned long *max_delay, bool v1_query) { unsigned long mldv1_md; @@ -1247,11 +1245,32 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, if (mld_in_v2_mode_only(idev)) return -EINVAL; - /* MLDv1 router present */ mldv1_md = ntohs(mld->mld_maxdelay); + + /* When in MLDv1 fallback and a MLDv2 router start-up being + * unaware of current MLDv1 operation, the MRC == MRD mapping + * only works when the exponential algorithm is not being + * used (as MLDv1 is unaware of such things). 
+ * + * According to the RFC author, the MLDv2 implementations + * he's aware of all use a MRC < 32768 on start up queries. + * + * Thus, should we *ever* encounter something else larger + * than that, just assume the maximum possible within our + * reach. + */ + if (!v1_query) + mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); - mld_set_v1_mode(idev); + /* MLDv1 router present: we need to go into v1 mode *only* + * when an MLDv1 query is received as per section 9.12. of + * RFC3810! And we know from RFC2710 section 3.7 that MLDv1 + * queries MUST be of exactly 24 octets. + */ + if (v1_query) + mld_set_v1_mode(idev); /* cancel MLDv2 report timer */ mld_gq_stop_timer(idev); @@ -1266,10 +1285,6 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, unsigned long *max_delay) { - /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ - if (mld_in_v1_mode(idev)) - return -EINVAL; - *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); mld_update_qrv(idev, mld); @@ -1326,8 +1341,11 @@ int igmp6_event_query(struct sk_buff *skb) !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - if (len == MLD_V1_QUERY_LEN) { - err = mld_process_v1(idev, mld, &max_delay); + if (len < MLD_V1_QUERY_LEN) { + return -EINVAL; + } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { + err = mld_process_v1(idev, mld, &max_delay, + len == MLD_V1_QUERY_LEN); if (err < 0) return err; } else if (len >= MLD_V2_QUERY_LEN_MIN) { @@ -1359,18 +1377,19 @@ int igmp6_event_query(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } - } else + } else { return -EINVAL; + } read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { spin_lock_bh(&ma->mca_lock); igmp6_group_queried(ma, max_delay); 
spin_unlock_bh(&ma->mca_lock); } } else { - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; spin_lock_bh(&ma->mca_lock); @@ -1434,7 +1453,7 @@ int igmp6_event_report(struct sk_buff *skb) */ read_lock_bh(&idev->lock); - for (ma = idev->mc_list; ma; ma=ma->next) { + for (ma = idev->mc_list; ma; ma = ma->next) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); if (del_timer(&ma->mca_timer)) @@ -1498,7 +1517,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) struct ip6_sf_list *psf; int scount = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -1712,7 +1731,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { struct in6_addr *psrc; psf_next = psf->sf_next; @@ -1791,7 +1810,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) read_lock_bh(&idev->lock); if (!pmc) { - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (pmc->mca_flags & MAF_NOREPORT) continue; spin_lock_bh(&pmc->mca_lock); @@ -1824,7 +1843,7 @@ static void mld_clear_zeros(struct ip6_sf_list **ppsf) struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf=*ppsf; psf; psf = psf_next) { + for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -1848,7 +1867,7 @@ static void mld_send_cr(struct inet6_dev *idev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->mca_sfmode == MCAST_INCLUDE) { type = 
MLD2_BLOCK_OLD_SOURCES; @@ -1881,7 +1900,7 @@ static void mld_send_cr(struct inet6_dev *idev) spin_unlock(&idev->mc_lock); /* change recs */ - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; @@ -2018,7 +2037,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) skb = NULL; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; @@ -2063,7 +2082,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2104,7 +2123,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2124,7 +2143,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfcount[sfmode]--; } err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; @@ -2140,7 +2159,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) @@ -2159,7 +2178,7 @@ static int ip6_mc_add1_src(struct 
ifmcaddr6 *pmc, int sfmode, struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2184,7 +2203,7 @@ static void sf_markstate(struct ifmcaddr6 *pmc) struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -2201,7 +2220,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) int new_in, rv; rv = 0; - for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -2211,8 +2230,8 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; - for (dpsf=pmc->mca_tomb; dpsf; - dpsf=dpsf->sf_next) { + for (dpsf = pmc->mca_tomb; dpsf; + dpsf = dpsf->sf_next) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2234,7 +2253,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2268,7 +2287,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; read_lock_bh(&idev->lock); - for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + for (pmc = idev->mc_list; pmc; pmc = pmc->next) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2284,7 +2303,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!delta) 
pmc->mca_sfcount[sfmode]++; err = 0; - for (i=0; i<sfcount; i++) { + for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; @@ -2294,7 +2313,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!delta) pmc->mca_sfcount[sfmode]--; - for (j=0; j<i; j++) + for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; @@ -2308,7 +2327,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + for (psf = pmc->mca_sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) @@ -2322,12 +2341,12 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; - for (psf=pmc->mca_tomb; psf; psf=nextpsf) { + for (psf = pmc->mca_tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->mca_tomb = NULL; - for (psf=pmc->mca_sources; psf; psf=nextpsf) { + for (psf = pmc->mca_sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -2366,7 +2385,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; - /* callers have the socket lock and a write lock on ipv6_sk_mc_lock, + /* callers have the socket lock and rtnl lock * so no other readers or writers of iml or its sflist */ if (!iml->sflist) { @@ -2471,13 +2490,21 @@ void ipv6_mc_down(struct inet6_dev *idev) mld_gq_stop_timer(idev); mld_dad_stop_timer(idev); - for (i = idev->mc_list; i; i=i->next) + for (i = idev->mc_list; i; i = i->next) igmp6_group_dropped(i); read_unlock_bh(&idev->lock); mld_clear_delrec(idev); } +static void ipv6_mc_reset(struct inet6_dev *idev) +{ + idev->mc_qrv = sysctl_mld_qrv; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + 
idev->mc_v1_seen = 0; + idev->mc_maxdelay = unsolicited_report_interval(idev); +} /* Device going up */ @@ -2488,7 +2515,8 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ read_lock_bh(&idev->lock); - for (i = idev->mc_list; i; i=i->next) + ipv6_mc_reset(idev); + for (i = idev->mc_list; i; i = i->next) igmp6_group_added(i); read_unlock_bh(&idev->lock); } @@ -2508,13 +2536,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) (unsigned long)idev); setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); - - idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_qi = MLD_QI_DEFAULT; - idev->mc_qri = MLD_QRI_DEFAULT; - - idev->mc_maxdelay = unsolicited_report_interval(idev); - idev->mc_v1_seen = 0; + ipv6_mc_reset(idev); write_unlock_bh(&idev->lock); } diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index db9b6cbc9db3..f61429d391d3 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -336,11 +336,10 @@ static void mip6_destopt_destroy(struct xfrm_state *x) { } -static const struct xfrm_type mip6_destopt_type = -{ +static const struct xfrm_type mip6_destopt_type = { .description = "MIP6DESTOPT", .owner = THIS_MODULE, - .proto = IPPROTO_DSTOPTS, + .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, @@ -469,11 +468,10 @@ static void mip6_rthdr_destroy(struct xfrm_state *x) { } -static const struct xfrm_type mip6_rthdr_type = -{ +static const struct xfrm_type mip6_rthdr_type = { .description = "MIP6RT", .owner = THIS_MODULE, - .proto = IPPROTO_ROUTING, + .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ca8d4ea48a5d..4cb45c1079a2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -175,7 +175,7 @@ static struct nd_opt_hdr 
*ndisc_next_option(struct nd_opt_hdr *cur, type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while(cur < end && cur->nd_opt_type != type); + } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } @@ -192,7 +192,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while(cur < end && !ndisc_is_useropt(cur)); + } while (cur < end && !ndisc_is_useropt(cur)); return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } @@ -284,7 +284,6 @@ int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, } return -EINVAL; } - EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, @@ -296,7 +295,7 @@ static u32 ndisc_hash(const void *pkey, static int ndisc_constructor(struct neighbour *neigh) { - struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; @@ -344,7 +343,7 @@ static int ndisc_constructor(struct neighbour *neigh) static int pndisc_constructor(struct pneigh_entry *n) { - struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; @@ -357,7 +356,7 @@ static int pndisc_constructor(struct pneigh_entry *n) static void pndisc_destructor(struct pneigh_entry *n) { - struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; @@ -1065,11 +1064,14 @@ static void ndisc_router_discovery(struct sk_buff *skb) int optlen; unsigned int pref = 0; - __u8 * opt = (__u8 *)(ra_msg + 1); + __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); + ND_PRINTK(2, 
info, + "RA: %s, dev: %s\n", + __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; @@ -1102,13 +1104,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, did not accept ra for dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT, dev: %s\n", + __func__, skb->dev->name); goto skip_linkparms; + } #endif if (in6_dev->if_flags & IF_RS_SENT) { @@ -1130,11 +1140,24 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); - if (!in6_dev->cnf.accept_ra_defrtr) + if (!in6_dev->cnf.accept_ra_defrtr) { + ND_PRINTK(2, info, + "RA: %s, defrtr is false for dev: %s\n", + __func__, skb->dev->name); goto skip_defrtr; + } - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + /* Do not accept RA with source-addr found on local machine unless + * accept_ra_from_local is set to true. 
+ */ + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; + } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); @@ -1163,8 +1186,10 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } + ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", + rt, lifetime, skb->dev->name); if (rt == NULL && lifetime) { - ND_PRINTK(3, dbg, "RA: adding default router\n"); + ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { @@ -1260,12 +1285,22 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - if (!ipv6_accept_ra(in6_dev)) + if (!ipv6_accept_ra(in6_dev)) { + ND_PRINTK(2, info, + "RA: %s, accept_ra is false for dev: %s\n", + __func__, skb->dev->name); goto out; + } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + if (!in6_dev->cnf.accept_ra_from_local && + ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, + NULL, 0)) { + ND_PRINTK(2, info, + "RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; + } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; @@ -1283,7 +1318,7 @@ skip_linkparms: continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; - rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, + rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } @@ -1293,8 +1328,12 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ - if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { + ND_PRINTK(2, info, + "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + 
__func__, skb->dev->name); goto out; + } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { @@ -1312,7 +1351,7 @@ skip_routeinfo: __be32 n; u32 mtu; - memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { @@ -1728,7 +1767,7 @@ int __init ndisc_init(void) #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, - &ndisc_ifinfo_sysctl_change); + ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4bff1f297e39..6af874fc187f 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -40,9 +40,35 @@ config NFT_CHAIN_ROUTE_IPV6 fields such as the source, destination, flowlabel, hop-limit and the packet mark. +config NF_REJECT_IPV6 + tristate "IPv6 packet rejection" + default m if NETFILTER_ADVANCED=n + +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + select NF_REJECT_IPV6 + default NFT_REJECT + tristate + +config NF_LOG_IPV6 + tristate "IPv6 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_NAT_IPV6 + tristate "IPv6 NAT" + depends on NF_CONNTRACK_IPV6 + depends on NETFILTER_ADVANCED + select NF_NAT + help + The IPv6 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. This can be + controlled by iptables or nft. + +if NF_NAT_IPV6 + config NFT_CHAIN_NAT_IPV6 depends on NF_TABLES_IPV6 - depends on NF_NAT_IPV6 && NFT_NAT tristate "IPv6 nf_tables nat chain support" help This option enables the "nat" chain for IPv6 in nf_tables. This @@ -50,10 +76,22 @@ config NFT_CHAIN_NAT_IPV6 packet transformations such as the source, destination address and source and destination ports. 
-config NFT_REJECT_IPV6 +config NF_NAT_MASQUERADE_IPV6 + tristate "IPv6 masquerade support" + help + This is the kernel functionality to provide NAT in the masquerade + flavour (automatic source address selection) for IPv6. + +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" depends on NF_TABLES_IPV6 - default NFT_REJECT - tristate + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + help + This is the expression that provides IPv6 masquerading support for + nf_tables. + +endif # NF_NAT_IPV6 config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" @@ -175,6 +213,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" depends on IP6_NF_FILTER + select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMPv6 @@ -227,22 +266,25 @@ config IP6_NF_SECURITY If unsure, say N. -config NF_NAT_IPV6 - tristate "IPv6 NAT" +config IP6_NF_NAT + tristate "ip6tables NAT support" depends on NF_CONNTRACK_IPV6 depends on NETFILTER_ADVANCED select NF_NAT + select NF_NAT_IPV6 + select NETFILTER_XT_NAT help - The IPv6 NAT option allows masquerading, port forwarding and other - forms of full Network Address Port Translation. It is controlled by - the `nat' table in ip6tables, see the man page for ip6tables(8). + This enables the `nat' table in ip6tables. This allows masquerading, + port forwarding and other forms of full Network Address Port + Translation. To compile it as a module, choose M here. If unsure, say N. -if NF_NAT_IPV6 +if IP6_NF_NAT config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" + select NF_NAT_MASQUERADE_IPV6 help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and @@ -260,7 +302,7 @@ config IP6_NF_TARGET_NPT To compile it as a module, choose M here. If unsure, say N.
-endif # NF_NAT_IPV6 +endif # IP6_NF_NAT endif # IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 70d3dd66f2cd..fbb25f01143c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o -obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o +obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o # objects for l3 independent conntrack nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o @@ -18,16 +18,24 @@ obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o +obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o +# logging +obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o + +# reject +obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o +obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 3e4e92d5e157..7f9f45d829d2 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -19,33 +19,12 @@ #include <net/netfilter/nf_nat.h> #include <net/addrconf.h> #include <net/ipv6.h> +#include <net/netfilter/ipv6/nf_nat_masquerade.h> static unsigned int masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - enum 
ip_conntrack_info ctinfo; - struct in6_addr src; - struct nf_conn *ct; - struct nf_nat_range newrange; - - ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); - - if (ipv6_dev_get_saddr(dev_net(par->out), par->out, - &ipv6_hdr(skb)->daddr, 0, &src) < 0) - return NF_DROP; - - nfct_nat(ct)->masq_index = par->out->ifindex; - - newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; - newrange.min_addr.in6 = src; - newrange.max_addr.in6 = src; - newrange.min_proto = range->min_proto; - newrange.max_proto = range->max_proto; - - return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); + return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out); } static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) @@ -57,48 +36,6 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) return 0; } -static int device_cmp(struct nf_conn *ct, void *ifindex) -{ - const struct nf_conn_nat *nat = nfct_nat(ct); - - if (!nat) - return 0; - if (nf_ct_l3num(ct) != NFPROTO_IPV6) - return 0; - return nat->masq_index == (int)(long)ifindex; -} - -static int masq_device_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - const struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net *net = dev_net(dev); - - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); - - return NOTIFY_DONE; -} - -static struct notifier_block masq_dev_notifier = { - .notifier_call = masq_device_event, -}; - -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; - - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); -} - -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, -}; - static struct xt_target masquerade_tg6_reg 
__read_mostly = { .name = "MASQUERADE", .family = NFPROTO_IPV6, @@ -115,17 +52,14 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) { - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); - } + if (err == 0) + nf_nat_masquerade_ipv6_register_notifier(); return err; } static void __exit masquerade_tg6_exit(void) { - unregister_inet6addr_notifier(&masq_inet_notifier); - unregister_netdevice_notifier(&masq_dev_notifier); + nf_nat_masquerade_ipv6_unregister_notifier(); xt_unregister_target(&masquerade_tg6_reg); } diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 54bd9790603f..8b147440fbdc 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -94,7 +94,6 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) break; default: return false; - break; } nexthdr = hp->nexthdr; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 387d8b8fc18d..b0634ac996b7 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -30,222 +30,57 @@ static const struct xt_table nf_nat_ipv6_table = { .af = NFPROTO_IPV6, }; -static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - */ - struct nf_nat_range range; - - range.flags = 0; - pr_debug("Allocating NULL binding for %p (%pI6)\n", ct, - HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? 
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 : - &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6); - - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, - const struct net_device *in, - const struct net_device *out, - struct nf_conn *ct) +static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { struct net *net = nf_ct_net(ct); - unsigned int ret; - ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat); - if (ret == NF_ACCEPT) { - if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) - ret = alloc_null_binding(ct, hooknum); - } - return ret; + return ip6t_do_table(skb, ops->hooknum, in, out, net->ipv6.ip6table_nat); } -static unsigned int -nf_nat_ipv6_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - __be16 frag_off; - int hdrlen; - u8 nexthdr; - - ct = nf_ct_get(skb, &ctinfo); - /* Can't track? It's not due to stress, or conntrack would - * have dropped it. Hence it's the user's responsibilty to - * packet filter it out, or implement conntrack/NAT for that - * protocol. 
8) --RR - */ - if (!ct) - return NF_ACCEPT; - - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - nexthdr = ipv6_hdr(skb)->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, - hdrlen)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - unsigned int ret; - - ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); - if (ret != NF_ACCEPT) - return ret; - } else { - pr_debug("Already setup manip %s for ct %p\n", - maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", - ct); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - break; - - default: - /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); - if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) - goto oif_changed; - } - - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); - -oif_changed: - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_DROP; + return nf_nat_ipv6_fn(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - unsigned int ret; - struct in6_addr daddr = ipv6_hdr(skb)->daddr; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv6_in(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_out(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { -#ifdef CONFIG_XFRM - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - int err; -#endif - unsigned int ret; - - /* root is playing with raw sockets. 
*/ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3) || - (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } -#endif - return ret; + return nf_nat_ipv6_out(ops, skb, in, out, ip6table_nat_do_chain); } -static unsigned int -nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - const struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned int ret; - int err; - - /* root is playing with raw sockets. 
*/ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, - &ct->tuplehash[!dir].tuple.src.u3)) { - err = ip6_route_me_harder(skb); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#ifdef CONFIG_XFRM - else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) { - err = nf_xfrm_me_harder(skb, AF_INET6); - if (err < 0) - ret = NF_DROP_ERR(err); - } -#endif - } - return ret; + return nf_nat_ipv6_local_fn(ops, skb, in, out, ip6table_nat_do_chain); } static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_in, + .hook = ip6table_nat_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, @@ -253,7 +88,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_out, + .hook = ip6table_nat_out, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, @@ -261,7 +96,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* Before packet filtering, change destination */ { - .hook = nf_nat_ipv6_local_fn, + .hook = ip6table_nat_local_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, @@ -269,7 +104,7 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = ip6table_nat_fn, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0d5279fd852a..6f187c8d8a1b 
100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -50,6 +50,7 @@ #include <linux/module.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +static const char nf_frags_cache_name[] = "nf-frags"; struct nf_ct_frag6_skb_cb { @@ -63,6 +64,8 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", @@ -76,14 +79,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -102,7 +108,10 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[0].data = &net->nf_frag.frags.timeout; table[1].data = &net->nf_frag.frags.low_thresh; + table[1].extra2 = &net->nf_frag.frags.high_thresh; table[2].data = &net->nf_frag.frags.high_thresh; + table[2].extra1 = &net->nf_frag.frags.low_thresh; + table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } hdr = register_net_sysctl(net, "net/netfilter", table); @@ -147,16 +156,13 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, 
nf_frags.rnd); } -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static unsigned int nf_hashfn(const struct inet_frag_queue *q) { const struct frag_queue *nq; @@ -196,7 +202,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.dst = dst; arg.ecn = ecn; - read_lock_bh(&nf_frags.lock); + local_bh_disable(); hash = nf_hash_frag(id, src, dst); q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); @@ -217,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, int offset, end; u8 ecn; - if (fq->q.last_in & INET_FRAG_COMPLETE) { + if (fq->q.flags & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); goto err; } @@ -248,11 +254,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) { + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } - fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -267,7 +273,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. 
*/ - if (fq->q.last_in & INET_FRAG_LAST_IN) { + if (fq->q.flags & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } @@ -349,10 +355,9 @@ found: */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= INET_FRAG_FIRST_IN; + fq->q.flags |= INET_FRAG_FIRST_IN; } - inet_frag_lru_move(&fq->q); return 0; discard_fq: @@ -597,10 +602,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - local_bh_disable(); - inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); - local_bh_enable(); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq == NULL) { @@ -617,7 +618,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) goto ret_orig; } - if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { ret_skb = nf_ct_frag6_reasm(fq, dev); if (ret_skb == NULL) @@ -677,13 +678,15 @@ int nf_ct_frag6_init(void) nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; - nf_frags.secret_interval = 10 * 60 * HZ; - inet_frags_init(&nf_frags); - + nf_frags.frags_cache_name = nf_frags_cache_name; + ret = inet_frags_init(&nf_frags); + if (ret) + goto out; ret = register_pernet_subsys(&nf_ct_net_ops); if (ret) inet_frags_fini(&nf_frags); +out: return ret; } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 7b9a748c6bac..e70382e4dfb5 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -40,7 +40,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (skb->nf_bridge && skb->nf_bridge->mask & 
BRNF_NF_BRIDGE_PREROUTING) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c new file mode 100644 index 000000000000..7b17a0be93e7 --- /dev/null +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -0,0 +1,417 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + 
nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + nf_log_buf_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + nf_log_buf_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + nf_log_buf_add(m, "INCOMPLETE "); + + nf_log_buf_add(m, "ID:%08x ", + ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + nf_log_buf_add(m, "AH "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + 
nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + nf_log_buf_add(m, "ESP "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + nf_log_buf_add(m, "SPI=0x%x )", + ntohl(eh->spi)); + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + nf_log_buf_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, + ptr, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + nf_log_buf_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", + ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: 
+ break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + nf_log_buf_add(m, "POINTER=%08x ", + ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + nf_log_buf_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { + nf_log_buf_add(m, "MTU=%u ", + ntohl(ic->icmp6_mtu)); + } + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse) + nf_log_dump_sk_uid_gid(m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + nf_log_buf_add(m, ":%02x", *p++); + } + nf_log_buf_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr 
*)skb_mac_header(skb); + nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else { + nf_log_buf_add(m, " "); + } +} + +static void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net)) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, + loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip6_logger __read_mostly = { + .name = "nf_log_ipv6", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip6_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_ipv6_net_init(struct net *net) +{ + nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __net_exit nf_log_ipv6_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip6_logger); +} + +static struct pernet_operations nf_log_ipv6_net_ops = { + .init = nf_log_ipv6_net_init, + .exit = nf_log_ipv6_net_exit, +}; + +static int __init nf_log_ipv6_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_ipv6_net_ops); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + return 0; +} + +static void __exit nf_log_ipv6_exit(void) +{ + unregister_pernet_subsys(&nf_log_ipv6_net_ops); + nf_log_unregister(&nf_ip6_logger); +} + +module_init(nf_log_ipv6_init); +module_exit(nf_log_ipv6_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); diff --git 
a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index abfe75a2e316..c5812e1c1ffb 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -158,6 +158,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, htons(oldlen), htons(datalen), 1); } +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { @@ -175,6 +176,7 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], return 0; } +#endif static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .l3proto = NFPROTO_IPV6, @@ -183,7 +185,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { .manip_pkt = nf_nat_ipv6_manip_pkt, .csum_update = nf_nat_ipv6_csum_update, .csum_recalc = nf_nat_ipv6_csum_recalc, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_ipv6_nlattr_to_range, +#endif #ifdef CONFIG_XFRM .decode_session = nf_nat_ipv6_decode_session, #endif @@ -257,6 +261,205 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); +unsigned int +nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 
8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = do_chain(ops, skb, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + + if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum))) + break; + + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn); + +unsigned int +nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + unsigned int ret; + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_in); + +unsigned int +nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_out); + +unsigned int +nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + unsigned int (*do_chain)(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, do_chain); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn); + static int __init nf_nat_l3proto_ipv6_init(void) { int err; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c new file mode 100644 index 000000000000..7745609665cd --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 + * NAT funded by Astaro. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/atomic.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_nat.h> +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/netfilter/ipv6/nf_nat_masquerade.h> + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, + const struct net_device *out) +{ + enum ip_conntrack_info ctinfo; + struct in6_addr src; + struct nf_conn *ct; + struct nf_nat_range newrange; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + if (ipv6_dev_get_saddr(dev_net(out), out, + &ipv6_hdr(skb)->daddr, 0, &src) < 0) + return NF_DROP; + + nfct_nat(ct)->masq_index = out->ifindex; + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = src; + newrange.max_addr.in6 = src; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); + +static int device_cmp(struct nf_conn *ct, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct 
netdev_notifier_info info; + + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); + +void nf_nat_masquerade_ipv6_register_notifier(void) +{ + /* check if the notifier is already set */ + if (atomic_inc_return(&masquerade_notifier_refcount) > 1) + return; + + register_netdevice_notifier(&masq_dev_notifier); + register_inet6addr_notifier(&masq_inet_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); + +void nf_nat_masquerade_ipv6_unregister_notifier(void) +{ + /* check if the notifier still has clients */ + if (atomic_dec_return(&masquerade_notifier_refcount) > 0) + return; + + unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_netdevice_notifier(&masq_dev_notifier); +} +EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c new file mode 100644 index 000000000000..5f5f0438d74d --- /dev/null +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -0,0 +1,163 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/ip6_fib.h> +#include <net/ip6_checksum.h> +#include <linux/netfilter_ipv6.h> + +void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; + struct dst_entry *dst = NULL; + u8 proto; + __be16 frag_off; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + pr_debug("Cannot get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. */ + if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP, " + "or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + pr_debug("RST is set\n"); + return; + } + + /* Check checksum. 
*/ + if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { + pr_debug("TCP checksum is invalid\n"); + return; + } + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; + fl6.fl6_sport = otcph.dest; + fl6.fl6_dport = otcph.source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + net_dbg_ratelimited("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); + ip6_flow_hdr(ip6h, tclass, 0); + ip6h->hop_limit = ip6_dst_hoplimit(dst); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->saddr = oip6h->daddr; + ip6h->daddr = oip6h->saddr; + + skb_reset_transport_header(nskb); + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial(tcph, + 
sizeof(struct tcphdr), 0)); + + nf_ct_attach(nskb, oldskb); + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + /* If we use ip6_local_out for bridged traffic, the MAC source on + * the RST will be ours, instead of the destination's. This confuses + * some routers/firewalls, and they drop the packet. So we need to + * build the eth header using the original destination's MAC as the + * source, and send the RST packet directly. + */ + if (oldskb->nf_bridge) { + struct ethhdr *oeth = eth_hdr(oldskb); + nskb->dev = oldskb->nf_bridge->physindev; + nskb->protocol = htons(ETH_P_IPV6); + ip6h->payload_len = htons(sizeof(struct tcphdr)); + if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), + oeth->h_source, oeth->h_dest, nskb->len) < 0) { + /* Header build failed: nskb (and its dst) is still ours, free it. */ + kfree_skb(nskb); + return; + } + dev_queue_xmit(nskb); + } else +#endif + ip6_local_out(nskb); +} +EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index d189fcb437fe..1c4b75dd425b 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -24,144 +24,53 @@ #include <net/netfilter/nf_nat_l3proto.h> #include <net/ipv6.h> -/* - * IPv6 NAT chains - */ - -static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) { - enum ip_conntrack_info ctinfo; - struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - struct nf_conn_nat *nat; - enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); - __be16 frag_off; - int hdrlen; - u8 nexthdr; struct nft_pktinfo pkt; - unsigned int ret; - - if (ct == NULL || nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; - - switch (ctinfo) { - case 
IP_CT_RELATED: - case IP_CT_RELATED + IP_CT_IS_REPLY: - nexthdr = ipv6_hdr(skb)->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, - ops->hooknum, - hdrlen)) - return NF_DROP; - else - return NF_ACCEPT; - } - /* Fall through */ - case IP_CT_NEW: - if (nf_nat_initialized(ct, maniptype)) - break; - - nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); - ret = nft_do_chain(&pkt, ops); - if (ret != NF_ACCEPT) - return ret; - if (!nf_nat_initialized(ct, maniptype)) { - ret = nf_nat_alloc_null_binding(ct, ops->hooknum); - if (ret != NF_ACCEPT) - return ret; - } - default: - break; - } + nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); - return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + return nft_do_chain(&pkt, ops); } -static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct in6_addr daddr = ipv6_hdr(skb)->daddr; - unsigned int ret; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) - skb_dst_drop(skb); - - return ret; + return nf_nat_ipv6_fn(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo __maybe_unused; - 
const struct nf_conn *ct __maybe_unused; - unsigned int ret; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); -#ifdef CONFIG_XFRM - if (ret != NF_DROP && ret != NF_STOLEN && - !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, - &ct->tuplehash[!dir].tuple.dst.u3) || - (ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all)) - if (nf_xfrm_me_harder(skb, AF_INET6) < 0) - ret = NF_DROP; - } -#endif - return ret; + return nf_nat_ipv6_in(ops, skb, in, out, nft_nat_do_chain); } -static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; - unsigned int ret; - - ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); - if (ret != NF_DROP && ret != NF_STOLEN && - (ct = nf_ct_get(skb, &ctinfo)) != NULL) { - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + return nf_nat_ipv6_out(ops, skb, in, out, nft_nat_do_chain); +} - if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, - &ct->tuplehash[!dir].tuple.src.u3)) { - if (ip6_route_me_harder(skb)) - ret = NF_DROP; - } -#ifdef CONFIG_XFRM - else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - ct->tuplehash[dir].tuple.dst.u.all != - ct->tuplehash[!dir].tuple.src.u.all) - if (nf_xfrm_me_harder(skb, AF_INET6)) - ret = NF_DROP; -#endif - } - return ret; +static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_nat_ipv6_local_fn(ops, skb, 
in, out, nft_nat_do_chain); } static const struct nf_chain_type nft_chain_nat_ipv6 = { @@ -174,10 +83,10 @@ static const struct nf_chain_type nft_chain_nat_ipv6 = { (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .hooks = { - [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, - [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, - [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, - [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, + [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in, + [NF_INET_POST_ROUTING] = nft_nat_ipv6_out, + [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, + [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, }; diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c new file mode 100644 index 000000000000..556262f40761 --- /dev/null +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nft_masq.h> +#include <net/netfilter/ipv6/nf_nat_masquerade.h> + +static void nft_masq_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_masq_ipv6_type; +static const struct nft_expr_ops nft_masq_ipv6_ops = { + .type = &nft_masq_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv6_eval, + .init = nft_masq_init, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "masq", + .ops = &nft_masq_ipv6_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv6_module_init(void) +{ + int ret; + + ret = nft_register_expr(&nft_masq_ipv6_type); + if (ret < 0) + return ret; + + nf_nat_masquerade_ipv6_register_notifier(); + + return ret; +} + +static void __exit nft_masq_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv6_type); + nf_nat_masquerade_ipv6_unregister_notifier(); +} + +module_init(nft_masq_ipv6_module_init); +module_exit(nft_masq_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 5ec867e4a8b7..fc24c390af05 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -35,7 +35,7 @@ 
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) if (found_rhdr) return offset; break; - default : + default: return offset; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 3317440ea341..1752cd0b4882 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -8,7 +8,7 @@ * except it reports the sockets in the INET6 address family. * * Authors: David S. Miller (davem@caip.rutgers.edu) - * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -33,6 +33,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; + unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -42,8 +43,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(net), ip6_frag_mem(net)); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); return 0; } diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index e048cf1bb6a2..e3770abe688a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL(inet6_del_protocol); #endif const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet6_offloads); int inet6_add_offload(const struct net_offload *prot, unsigned char protocol) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b2dc60b0c764..896af8807979 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -176,7 +176,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) goto out; net = dev_net(skb->dev); - sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, 
saddr, inet6_iif(skb)); while (sk) { int filtered; @@ -220,7 +220,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } } sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, - IP6CB(skb)->iif); + inet6_iif(skb)); } out: read_unlock(&raw_v6_hashinfo.lock); @@ -375,7 +375,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, net = dev_net(skb->dev); while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, - IP6CB(skb)->iif))) { + inet6_iif(skb)))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); @@ -506,7 +506,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); *addr_len = sizeof(*sin6); } @@ -588,8 +588,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } offset += skb_transport_offset(skb); - if (skb_copy_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_copy_bits(skb, offset, &csum, 2)); /* in case cksum was not initialized */ if (unlikely(csum)) @@ -601,8 +600,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; - if (skb_store_bits(skb, offset, &csum, 2)) - BUG(); + BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); @@ -891,7 +889,7 @@ back_from_confirm: else { lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, - len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, msg->msg_flags, dontfrag); if (err) @@ -904,7 +902,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); - return err<0?err:len; + return err < 0 ? 
err : len; do_confirm: dst_confirm(dst); if (!(msg->msg_flags & MSG_PROBE) || len) @@ -1047,7 +1045,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, struct raw6_sock *rp = raw6_sk(sk); int val, len; - if (get_user(len,optlen)) + if (get_user(len, optlen)) return -EFAULT; switch (optname) { @@ -1071,7 +1069,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval,&val,len)) + if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index cc85a9ba5010..1a157ca2ebc1 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -60,13 +60,14 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> -struct ip6frag_skb_cb -{ +static const char ip6_frag_cache_name[] = "ip6-frags"; + +struct ip6frag_skb_cb { struct inet6_skb_parm h; int offset; }; -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -85,27 +86,23 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, const struct in6_addr *daddr) { - u32 c; - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); - - return c & (INETFRAGS_HASHSZ - 1); + return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); } -static unsigned int ip6_hashfn(struct inet_frag_queue *q) +static unsigned int ip6_hashfn(const struct inet_frag_queue *q) { - struct frag_queue *fq; + const struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -bool ip6_frag_match(struct inet_frag_queue *q, void *a) +bool 
ip6_frag_match(const struct inet_frag_queue *q, const void *a) { - struct frag_queue *fq; - struct ip6_create_arg *arg = a; + const struct frag_queue *fq; + const struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); return fq->id == arg->id && @@ -115,10 +112,10 @@ bool ip6_frag_match(struct inet_frag_queue *q, void *a) } EXPORT_SYMBOL(ip6_frag_match); -void ip6_frag_init(struct inet_frag_queue *q, void *a) +void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - struct ip6_create_arg *arg = a; + const struct ip6_create_arg *arg = a; fq->id = arg->id; fq->user = arg->user; @@ -135,7 +132,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, spin_lock(&fq->q.lock); - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q, frags); @@ -145,17 +142,20 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + if (fq->q.flags & INET_FRAG_EVICTED) + goto out_rcu_unlock; + + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments) + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) goto out_rcu_unlock; - /* - But use as source device on which LAST ARRIVED - segment was received. And do not use fq->dev - pointer directly, device might already disappeared. + /* But use as source device on which LAST ARRIVED + * segment was received. And do not use fq->dev + * pointer directly, device might already disappeared. 
*/ fq->q.fragments->dev = dev; icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); @@ -192,7 +192,6 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.dst = dst; arg.ecn = ecn; - read_lock(&ip6_frags.lock); hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); @@ -212,7 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct net *net = dev_net(skb_dst(skb)->dev); u8 ecn; - if (fq->q.last_in & INET_FRAG_COMPLETE) + if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = ntohs(fhdr->frag_off) & ~0x7; @@ -243,9 +242,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; - fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -263,7 +262,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & INET_FRAG_LAST_IN) + if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } @@ -289,7 +288,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, goto found; } prev = NULL; - for(next = fq->q.fragments; next != NULL; next = next->next) { + for (next = fq->q.fragments; next != NULL; next = next->next) { if (FRAG6_CB(next)->offset >= offset) break; /* bingo! 
*/ prev = next; @@ -338,10 +337,10 @@ found: */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= INET_FRAG_FIRST_IN; + fq->q.flags |= INET_FRAG_FIRST_IN; } - if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; @@ -353,14 +352,13 @@ found: } skb_dst_drop(skb); - inet_frag_lru_move(&fq->q); return -1; discard_fq: inet_frag_kill(&fq->q, &ip6_frags); err: - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; } @@ -523,7 +521,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - int evicted; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -531,7 +528,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. 
header */ - if (hdr->payload_len==0) + if (hdr->payload_len == 0) goto fail_hdr; if (!pskb_may_pull(skb, (skb_transport_offset(skb) + @@ -552,11 +549,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); - if (evicted) - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS, evicted); - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, ip6_frag_ecn(hdr)); if (fq != NULL) { @@ -576,32 +568,37 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } -static const struct inet6_protocol frag_protocol = -{ +static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; #ifdef CONFIG_SYSCTL +static int zero; + static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", @@ -613,10 +610,12 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { } }; +/* secret interval has been deprecated */ +static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", - .data = &ip6_frags.secret_interval, + .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = 
proc_dointvec_jiffies, @@ -636,7 +635,10 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; + table[0].extra1 = &net->ipv6.frags.low_thresh; + table[0].extra2 = &init_net.ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; + table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; /* Don't export sysctls to unprivileged users */ @@ -746,8 +748,10 @@ int __init ipv6_frag_init(void) ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.secret_interval = 10 * 60 * HZ; - inet_frags_init(&ip6_frags); + ip6_frags.frags_cache_name = ip6_frag_cache_name; + ret = inet_frags_init(&ip6_frags); + if (ret) + goto err_pernet; out: return ret; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f23fbd28a501..a318dd89b6d9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -314,7 +314,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); - rt->rt6i_genid = rt_genid_ipv6(net); INIT_LIST_HEAD(&rt->rt6i_siblings); } return rt; @@ -813,7 +812,7 @@ out: } -struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, +struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags) { return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); @@ -843,7 +842,6 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, return NULL; } - EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. 
@@ -1024,7 +1022,7 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, +struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; @@ -1041,7 +1039,6 @@ struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } - EXPORT_SYMBOL(ip6_route_output); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -1098,9 +1095,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) - return NULL; - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) return NULL; @@ -1149,7 +1143,7 @@ static void ip6_link_failure(struct sk_buff *skb) static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { - struct rt6_info *rt6 = (struct rt6_info*)dst; + struct rt6_info *rt6 = (struct rt6_info *)dst; dst_confirm(dst); if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { @@ -1924,7 +1918,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, return NULL; read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); if (!fn) goto out; @@ -1983,7 +1977,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev return NULL; read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 
ipv6_addr_equal(&rt->rt6i_gateway, addr)) @@ -2068,7 +2062,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct in6_rtmsg rtmsg; int err; - switch(cmd) { + switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2191,7 +2185,7 @@ int ip6_route_get_saddr(struct net *net, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt); int err = 0; if (rt->rt6i_prefsrc.plen) *saddr = rt->rt6i_prefsrc.addr; @@ -2486,7 +2480,7 @@ beginning: return last_err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2501,7 +2495,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) { struct fib6_config cfg; int err; @@ -2693,7 +2687,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4f408176dc64..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } 
for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && - (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (!dev || !t->parms.link || dev->ifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } @@ -250,7 +250,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, else strcpy(name, "sit%d"); - dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, + ipip6_tunnel_setup); if (dev == NULL) return NULL; @@ -811,9 +812,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - unsigned int max_headroom; /* The extra header space needed */ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; @@ -821,6 +822,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, int addr_type; u8 ttl; int err; + u8 protocol = IPPROTO_IPV6; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; @@ -910,8 +913,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + if (IS_ERR(skb)) { + ip_rt_put(rt); + goto out; + } + if (df) { - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < 68) { dev->stats.collisions++; @@ -946,7 +955,7 @@ static netdev_tx_t 
ipip6_tunnel_xmit(struct sk_buff *skb, /* * Okay, now see if we can stuff it in the buffer as-is. */ - max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { @@ -968,14 +977,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); - if (IS_ERR(skb)) { + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) { ip_rt_put(rt); - goto out; + goto tx_error; } + skb_set_inner_ipproto(skb, IPPROTO_IPV6); + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, - IPPROTO_IPV6, tos, ttl, df, + protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; @@ -998,6 +1008,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (IS_ERR(skb)) goto out; + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); return NETDEV_TX_OK; out: @@ -1058,8 +1070,10 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); + dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } @@ -1122,7 +1136,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, #endif static int -ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; @@ -1306,7 +1320,10 @@ done: static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) { - if (new_mtu 
< IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -1337,14 +1354,17 @@ static void ipip6_dev_free(struct net_device *dev) static void ipip6_tunnel_setup(struct net_device *dev) { + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + dev->netdev_ops = &ipip6_netdev_ops; - dev->destructor = ipip6_dev_free; + dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; dev->flags = IFF_NOARP; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; @@ -1465,6 +1485,40 @@ static void ipip6_netlink_parms(struct nlattr *data[], } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipip6_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} + #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr 
*data[], @@ -1508,12 +1562,20 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, { struct net *net = dev_net(dev); struct ip_tunnel *nt; + struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif int err; nt = netdev_priv(dev); + + if (ipip6_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(nt, &ipencap); + if (err < 0) + return err; + } + ipip6_netlink_parms(data, &nt->parms); if (ipip6_tunnel_locate(net, &nt->parms, 0)) @@ -1536,15 +1598,23 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; + if (ipip6_netlink_encap_parms(data, &ipencap)) { + err = ip_tunnel_encap_setup(t, &ipencap); + if (err < 0) + return err; + } + ipip6_netlink_parms(data, &p); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || @@ -1598,6 +1668,14 @@ static size_t ipip6_get_size(const struct net_device *dev) /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -1629,6 +1707,16 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1650,6 +1738,10 @@ static const struct nla_policy 
ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) @@ -1729,6 +1821,7 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[3] = sitn->tunnels_r_l; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a822b880689b..9a2838e93cc5 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -24,7 +24,7 @@ #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS]; +static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] 
@@ -187,7 +187,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + req = inet_reqsk_alloc(&tcp6_request_sock_ops); if (!req) goto out; @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 058f3eca2e53..c5c10fafcfe2 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,6 +16,8 @@ #include <net/addrconf.h> #include <net/inet_frag.h> +static int one = 1; + static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", @@ -39,6 +41,13 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_dointvec }, { + .procname = "auto_flowlabels", + .data = &init_net.ipv6.sysctl.auto_flowlabels, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(int), @@ -56,6 +65,14 @@ static struct ctl_table ipv6_rotable[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "mld_qrv", + .data = &sysctl_mld_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; @@ -74,6 +91,8 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; + ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels; + ipv6_table[4].data = 
&net->ipv6.sysctl.fwmark_reflect; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 229239ad96b1..cf2e45ab2fa4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> -#include <net/netdma.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/tcp_memcontrol.h> @@ -93,13 +92,16 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + if (dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + } } static void tcp_v6_hash(struct sock *sk) @@ -198,6 +200,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; + ip6_set_txhash(sk); + /* * TCP over IPv4 */ @@ -470,13 +474,14 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, - struct flowi6 *fl6, + struct flowi *fl, struct request_sock *req, u16 queue_mapping, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -503,18 +508,6 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) -{ - struct flowi6 fl6; - int res; - - res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); - if (!res) { - 
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - } - return res; -} static void tcp_v6_reqsk_destructor(struct request_sock *req) { @@ -676,7 +669,8 @@ clear_hash_noput: return 1; } -static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +static int __tcp_v6_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; @@ -716,24 +710,81 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) } return 0; } + +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __tcp_v6_inbound_md5_hash(sk, skb); + rcu_read_unlock(); + + return ret; +} + #endif +static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + + ireq->ir_iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + if (!TCP_SKB_CB(skb)->tcp_tw_isn && + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || + np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || + np->rxopt.bits.rxohlim || np->repflow)) { + atomic_inc(&skb->users); + ireq->pktopts = skb; + } +} + +static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl, + const struct request_sock *req, + bool *strict) +{ + if (strict) + *strict = true; + return inet6_csk_route_req(sk, &fl->u.ip6, req); +} + struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_v6_rtx_synack, + .rtx_syn_ack = 
tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; -#ifdef CONFIG_TCP_MD5SIG static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr), +#ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_reqsk_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, -}; #endif + .init_req = tcp_v6_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = cookie_v6_init_sequence, +#endif + .route_req = tcp_v6_route_req, + .init_seq = tcp_v6_init_sequence, + .send_synack = tcp_v6_send_synack, + .queue_hash_add = inet6_csk_reqsk_queue_hash_add, +}; static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, @@ -973,153 +1024,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? 
-- erics - */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_options_received tmp_opt; - struct request_sock *req; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; - struct tcp_fastopen_cookie foc = { .len = -1 }; - bool want_cookie = false, fastopen; - struct flowi6 fl6; - int err; - if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; - if ((sysctl_tcp_syncookies == 2 || - inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); - if (!want_cookie) - goto drop; - } + return tcp_conn_request(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, skb); - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); - goto drop; - } - - req = inet6_reqsk_alloc(&tcp6_request_sock_ops); - if (req == NULL) - goto drop; - -#ifdef CONFIG_TCP_MD5SIG - tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; -#endif - - tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? 
NULL : &foc); - - if (want_cookie && !tmp_opt.saw_tstamp) - tcp_clear_options(&tmp_opt); - - tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (!want_cookie || tmp_opt.tstamp_ok) - TCP_ECN_create_request(req, skb, sock_net(sk)); - - ireq->ir_iif = sk->sk_bound_dev_if; - ireq->ir_mark = inet_request_mark(sk, skb); - - /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - if (!isn) { - if (ipv6_opt_accepted(sk, skb) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || - np->repflow) { - atomic_inc(&skb->users); - ireq->pktopts = skb; - } - - if (want_cookie) { - isn = cookie_v6_init_sequence(sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - goto have_isn; - } - - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (tmp_opt.saw_tstamp && - tcp_death_row.sysctl_tw_recycle && - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { - if (!tcp_peer_is_proven(req, dst, true)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } - /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false)) { - /* Without syncookies last quarter of - * backlog is filled with destinations, - * proven to be alive. 
- * It means that we continue to communicate - * to destinations, already remembered - * to the moment of synflood. - */ - LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", - &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); - goto drop_and_release; - } - - isn = tcp_v6_init_sequence(skb); - } -have_isn: - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; - - if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) - goto drop_and_free; - - tcp_rsk(req)->snt_isn = isn; - tcp_rsk(req)->snt_synack = tcp_time_stamp; - tcp_openreq_init_rwin(req, sk, dst); - fastopen = !want_cookie && - tcp_try_fastopen(sk, skb, req, &foc, dst); - err = tcp_v6_send_synack(sk, dst, &fl6, req, - skb_get_queue_mapping(skb), &foc); - if (!fastopen) { - if (err || want_cookie) - goto drop_and_free; - - tcp_rsk(req)->listener = NULL; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - } - return 0; - -drop_and_release: - dst_release(dst); -drop_and_free: - reqsk_free(req); drop: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; /* don't send reset */ @@ -1235,6 +1150,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; + ip6_set_txhash(newsk); + /* Now IPv6 options... First: no IPv4 options. 
@@ -1346,11 +1263,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); -#ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash(sk, skb)) - goto discard; -#endif - if (sk_filter(sk, skb)) goto discard; @@ -1455,7 +1367,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { @@ -1499,11 +1411,19 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; @@ -1523,6 +1443,11 @@ process: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash(sk, skb)) + goto discard_and_relse; +#endif + if (sk_filter(sk, skb)) goto discard_and_relse; @@ -1532,18 +1457,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if 
(!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -1681,6 +1596,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v6_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1711,6 +1627,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif + .mtu_reduced = tcp_v4_mtu_reduced, }; #ifdef CONFIG_TCP_MD5SIG @@ -1950,7 +1867,6 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 01b0ff9a0c2c..c1ab77105b4c 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,54 +15,17 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - ipv6h = ipv6_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - skb->ip_summed = CHECKSUM_PARTIAL; - __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); - return 0; -} - static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - const struct ipv6hdr *iph = skb_gro_network_header(skb); - __wsum wsum; - /* Don't bother verifying checksum if we're going to flush anyway. 
*/ - if (NAPI_GRO_CB(skb)->flush) - goto skip_csum; - - wsum = NAPI_GRO_CB(skb)->csum; - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), - wsum); - - /* fall through */ - - case CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, - wsum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - + if (!NAPI_GRO_CB(skb)->flush && + skb_gro_checksum_validate(skb, IPPROTO_TCP, + ip6_gro_compute_pseudo)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } -skip_csum: return tcp_gro_receive(head, skb); } @@ -78,10 +41,32 @@ static int tcp6_gro_complete(struct sk_buff *skb, int thoff) return tcp_gro_complete(skb); } +struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return ERR_PTR(-EINVAL); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + /* Set up pseudo header, usually expect stack to have done + * this. + */ + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + } + + return tcp_gso_segment(skb, features); +} static const struct net_offload tcpv6_offload = { .callbacks = { - .gso_send_check = tcp_v6_gso_send_check, - .gso_segment = tcp_gso_segment, + .gso_segment = tcp6_gso_segment, .gro_receive = tcp6_gro_receive, .gro_complete = tcp6_gro_complete, }, diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 2c4e4c5c7614..3c758007b327 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -15,7 +15,7 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
* * Authors Mitsuru KANDA <mk@linux-ipv6.org> - * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #define pr_fmt(fmt) "IPv6: " fmt @@ -64,7 +64,6 @@ err: return ret; } - EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) @@ -92,7 +91,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) return ret; } - EXPORT_SYMBOL(xfrm6_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7092ff78fd84..f6ba535b6feb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -79,7 +79,6 @@ static unsigned int udp6_ehashfn(struct net *net, int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk_ipv6only = ipv6_only_sock(sk); int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -95,7 +94,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 1; if (addr_type == IPV6_ADDR_ANY && - !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) return 1; if (sk2_rcv_saddr6 && @@ -244,7 +243,7 @@ begin: goto exact_match; } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -324,7 +323,7 @@ begin: } } else if (score == badness && reuseport) { matches++; - if (((u64)hash * matches) >> 32 == 0) + if (reciprocal_scale(hash, matches) == 0) result = sk; hash = next_pseudo_random32(hash); } @@ -374,8 +373,8 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup); /* - * This should be easy, if there is something there we - * return it, otherwise we block. 
+ * This should be easy, if there is something there we + * return it, otherwise we block. */ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, @@ -473,7 +472,7 @@ try_again: sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - IP6CB(skb)->iif); + inet6_iif(skb)); } *addr_len = sizeof(*sin6); } @@ -531,14 +530,18 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = &hdr->daddr; - struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct udphdr *uh = (struct udphdr *)(skb->data+offset); struct sock *sk; int err; + struct net *net = dev_net(skb->dev); - sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest, + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable); - if (sk == NULL) + if (sk == NULL) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; + } if (type == ICMPV6_PKT_TOOBIG) { if (!ip6_sk_accept_pmtu(sk)) @@ -593,7 +596,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static __inline__ void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info ) + u8 code, int offset, __be32 info) { __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } @@ -674,7 +677,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto csum_error; } - if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; @@ -703,43 +706,26 @@ drop: return -1; } -static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, const struct in6_addr *loc_addr, - __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) +static bool __udp_v6_is_mcast_sock(struct net *net, 
struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif, unsigned short hnum) { - struct hlist_nulls_node *node; - unsigned short num = ntohs(loc_port); - - sk_nulls_for_each_from(sk, node) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (udp_sk(sk)->udp_port_hash == num && - sk->sk_family == PF_INET6) { - if (inet->inet_dport) { - if (inet->inet_dport != rmt_port) - continue; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr) && - !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) - continue; - - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) - continue; + struct inet_sock *inet = inet_sk(sk); - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) - continue; - } - if (!inet6_mc_check(sk, loc_addr, rmt_addr)) - continue; - return sk; - } - } - return NULL; + if (!net_eq(sock_net(sk), net)) + return false; + + if (udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6 || + (inet->inet_dport && inet->inet_dport != rmt_port) || + (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) + return false; + return true; } static void flush_stack(struct sock **stack, unsigned int count, @@ -763,6 +749,7 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -788,43 +775,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, { struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; - unsigned int i, count = 0; + struct hlist_nulls_node *node; + unsigned short hnum = 
ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = inet6_iif(skb); + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + udp_table.mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = inet6_iif(skb); - sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - if (uh->check || udp_sk(sk)->no_check6_rx) + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { + if (__udp_v6_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum) && + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + (uh->check || udp_sk(sk)->no_check6_rx)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } stack[count++] = sk; - - sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sock_hold(sk); } } - /* - * before releasing the lock, we must take reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. 
*/ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } @@ -896,6 +891,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } + if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); @@ -965,10 +964,10 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } /** - * udp6_hwcsum_outgoing - handle outgoing HW checksumming - * @sk: socket we are sending on - * @skb: sk_buff containing the filled-in UDP header - * (checksum field must be zeroed out) + * udp6_hwcsum_outgoing - handle outgoing HW checksumming + * @sk: socket we are sending on + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) */ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, const struct in6_addr *saddr, @@ -1299,7 +1298,7 @@ do_append_data: getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), hlimit, tclass, opt, &fl6, - (struct rt6_info*)dst, + (struct rt6_info *)dst, corkreq ? 
msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); if (err) udp_v6_flush_pending_frames(sk); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 0ae3d98f83e0..6b8f543f6ac6 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -10,34 +10,13 @@ * UDPv6 GSO support */ #include <linux/skbuff.h> +#include <linux/netdevice.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/udp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - const struct ipv6hdr *ipv6h; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) - return -EINVAL; - - if (likely(!skb->encapsulation)) { - ipv6h = ipv6_hdr(skb); - uh = udp_hdr(skb); - - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - } - - return 0; -} - static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -48,7 +27,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u8 *packet_start, *prevhdr; u8 nexthdr; u8 frag_hdr_sz = sizeof(struct frag_hdr); - int offset; __wsum csum; int tnl_hlen; @@ -80,15 +58,29 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) - segs = skb_udp_tunnel_segment(skb, features); + segs = skb_udp_tunnel_segment(skb, features, true); else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot * do checksum of UDP packets sent as multiple IP fragments. 
*/ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ @@ -127,10 +119,52 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, out: return segs; } + +static struct sk_buff **udp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct udphdr *uh = udp_gro_udphdr(skb); + + if (unlikely(!uh)) + goto flush; + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (NAPI_GRO_CB(skb)->flush) + goto skip; + + if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo)) + goto flush; + else if (uh->check) + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_gro_compute_pseudo); + +skip: + NAPI_GRO_CB(skb)->is_ipv6 = 1; + return udp_gro_receive(head, skb, uh); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +static int udp6_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + + if (uh->check) + uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr, + &ipv6h->daddr, 0); + + return udp_gro_complete(skb, nhoff); +} + static const struct net_offload udpv6_offload = { .callbacks = { - .gso_send_check = udp6_ufo_send_check, .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, }, }; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index f8c3cf842f53..f48fbe4d16f5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -3,8 
+3,8 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * YOSHIFUJI Hideaki @USAGI * IPv6 support */ @@ -52,7 +52,6 @@ int xfrm6_rcv(struct sk_buff *skb) return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0); } - EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, @@ -142,5 +141,4 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, drop: return -1; } - EXPORT_SYMBOL(xfrm6_input_addr); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 433672d07d0b..ca3f29b98ae5 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -25,7 +25,6 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, { return ip6_find_1stfragopt(skb, prevhdr); } - EXPORT_SYMBOL(xfrm6_find_1stfragopt); static int xfrm6_local_dontfrag(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 2a0bbda2c76a..ac49f84fe2c3 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -3,11 +3,11 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro <kunihiro@ipinfusion.com> - * IPv6 support - * YOSHIFUJI Hideaki - * Split up af-specific portion + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion * */ @@ -84,7 +84,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { - struct rt6_info *rt = (struct rt6_info*)dst; + struct rt6_info *rt = (struct rt6_info *)dst; if (rt->rt6i_node) path->path_cookie = rt->rt6i_node->fn_sernum; } @@ -97,7 +97,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct 
flowi *fl) { - struct rt6_info *rt = (struct rt6_info*)xdst->route; + struct rt6_info *rt = (struct rt6_info *)xdst->route; xdst->u.dst.dev = dev; dev_hold(dev); @@ -296,7 +296,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, - .get_saddr = xfrm6_get_saddr, + .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, .init_dst = xfrm6_init_dst, @@ -319,9 +319,9 @@ static void xfrm6_policy_fini(void) static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", - .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, - .maxlen = sizeof(int), - .mode = 0644, + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, { } diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 3fc970135fc6..8a1f9c0d2a13 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -3,11 +3,11 @@ * * Authors: * Mitsuru KANDA @USAGI - * Kazunori MIYAZAWA @USAGI - * Kunihiro Ishiguro <kunihiro@ipinfusion.com> - * IPv6 support - * YOSHIFUJI Hideaki @USAGI - * Split up af-specific portion + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion * */ @@ -45,10 +45,10 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; diff --git 
a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 1c66465a42dd..5743044cd660 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -15,7 +15,7 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors Mitsuru KANDA <mk@linux-ipv6.org> - * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * * Based on net/ipv4/xfrm4_tunnel.c * @@ -110,7 +110,6 @@ __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) rcu_read_unlock_bh(); return htonl(spi); } - EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) @@ -187,7 +186,6 @@ __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) return htonl(spi); } - EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 54747c25c86c..92fafd485deb 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -674,7 +674,6 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) self->daddr = DEV_ADDR_ANY; kfree(discoveries); return -EHOSTUNREACH; - break; } } /* Cleanup our copy of the discovery log */ diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 2ba8b9705bb7..61ceb4cdb4a2 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -320,8 +320,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, __FILE__, __LINE__, tty->driver->name, port->count); spin_lock_irqsave(&port->lock, flags); - if (!tty_hung_up_p(filp)) - port->count--; + port->count--; port->blocked_open++; spin_unlock_irqrestore(&port->lock, flags); @@ -458,8 +457,7 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) /* * If the port is the middle of closing, bail out now */ - if (tty_hung_up_p(filp) || - test_bit(ASYNCB_CLOSING, &self->port.flags)) { + if (test_bit(ASYNCB_CLOSING, 
&self->port.flags)) { /* Hm, why are we blocking on ASYNC_CLOSING if we * do return -EAGAIN/-ERESTARTSYS below anyway? diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c index 365b895da84b..9e0d909390fd 100644 --- a/net/irda/irda_device.c +++ b/net/irda/irda_device.c @@ -293,7 +293,8 @@ static void irda_device_setup(struct net_device *dev) */ struct net_device *alloc_irdadev(int sizeof_priv) { - return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup); + return alloc_netdev(sizeof_priv, "irda%d", NET_NAME_UNKNOWN, + irda_device_setup); } EXPORT_SYMBOL(alloc_irdadev); diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index 7ac4d1becbfc..5a2d0a695529 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -98,7 +98,7 @@ static const struct file_operations irlan_fops = { extern struct proc_dir_entry *proc_irda; #endif /* CONFIG_PROC_FS */ -static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr); +static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr); static void __irlan_close(struct irlan_cb *self); static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, __u8 value_byte, __u16 value_short, @@ -196,7 +196,7 @@ static void __exit irlan_cleanup(void) * Open new instance of a client/provider, we should only register the * network device if this instance is ment for a particular client/provider */ -static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr) +static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr) { struct net_device *dev; struct irlan_cb *self; @@ -1024,7 +1024,6 @@ static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, default: IRDA_DEBUG(2, "%s(), Unknown parameter type!\n", __func__ ); return 0; - break; } /* Insert at end of sk-buffer */ diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index ffcec225b5d9..dc13f1a45f2f 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ 
-96,7 +96,7 @@ static void irlan_eth_setup(struct net_device *dev) */ struct net_device *alloc_irlandev(const char *name) { - return alloc_netdev(sizeof(struct irlan_cb), name, + return alloc_netdev(sizeof(struct irlan_cb), name, NET_NAME_UNKNOWN, irlan_eth_setup); } diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 9ea0c933b9ff..a37998c6273d 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -622,7 +622,7 @@ void irlap_send_rd_frame(struct irlap_cb *self) frame = (struct rd_frame *)skb_put(tx_skb, 2); frame->caddr = self->caddr; - frame->caddr = RD_RSP | PF_BIT; + frame->control = RD_RSP | PF_BIT; irlap_queue_xmit(self, tx_skb); } diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 98ad6ec4bd3c..a5f28d421ea8 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -1426,7 +1426,8 @@ __u8 *irlmp_hint_to_service(__u8 *hint) if (hint[1] & HINT_TELEPHONY) { IRDA_DEBUG(1, "Telephony "); service[i++] = S_TELEPHONY; - } if (hint[1] & HINT_FILE_SERVER) + } + if (hint[1] & HINT_FILE_SERVER) IRDA_DEBUG(1, "File Server "); if (hint[1] & HINT_COMM) { diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7a95fa4a3de1..a089b6b91650 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1103,7 +1103,6 @@ static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock, default: err = -EINVAL; goto out; - break; } } @@ -1543,7 +1542,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) sk->sk_shutdown |= how; if (how == RCV_SHUTDOWN || how == SHUTDOWN_MASK) { - if (iucv->transport == AF_IUCV_TRANS_IUCV) { + if ((iucv->transport == AF_IUCV_TRANS_IUCV) && + iucv->path) { err = pr_iucv->path_quiesce(iucv->path, NULL); if (err) err = -ENOTCONN; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index da787930df0a..2a6a1fdd62c0 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -493,8 +493,8 @@ static void iucv_declare_cpu(void *data) err = "Paging or storage error"; break; } - pr_warning("Defining an interrupt buffer on CPU 
%i" - " failed with 0x%02x (%s)\n", cpu, rc, err); + pr_warn("Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n", + cpu, rc, err); return; } @@ -1831,7 +1831,7 @@ static void iucv_external_interrupt(struct ext_code ext_code, BUG_ON(p->iptype < 0x01 || p->iptype > 0x09); work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { - pr_warning("iucv_external_interrupt: out of memory\n"); + pr_warn("iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); @@ -1974,8 +1974,7 @@ static int iucv_pm_restore(struct device *dev) printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); #endif if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) - pr_warning("Suspending Linux did not completely close all IUCV " - "connections\n"); + pr_warn("Suspending Linux did not completely close all IUCV connections\n"); iucv_pm_state = IUCV_PM_RESTORING; if (cpumask_empty(&iucv_irq_cpumask)) { rc = iucv_query_maxconn(); diff --git a/net/key/af_key.c b/net/key/af_key.c index ba2a2f95911c..1847ec4e3930 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -405,7 +405,6 @@ static int verify_address_len(const void *p) * XXX When it can, remove this -EINVAL. 
-DaveM */ return -EINVAL; - break; } return 0; @@ -536,7 +535,6 @@ pfkey_satype2proto(uint8_t satype) return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; - break; default: return 0; } @@ -553,7 +551,6 @@ pfkey_proto2satype(uint16_t proto) return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; - break; default: return 0; } diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig index adb9843dd7cf..378c73b26093 100644 --- a/net/l2tp/Kconfig +++ b/net/l2tp/Kconfig @@ -6,6 +6,7 @@ menuconfig L2TP tristate "Layer Two Tunneling Protocol (L2TP)" depends on (IPV6 || IPV6=n) depends on INET + select NET_UDP_TUNNEL ---help--- Layer Two Tunneling Protocol diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index bea259043205..895348e44c7d 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -52,6 +52,7 @@ #include <net/dst.h> #include <net/ip.h> #include <net/udp.h> +#include <net/udp_tunnel.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/protocol.h> @@ -147,7 +148,7 @@ do { \ atomic_read(&_t->ref_count)); \ l2tp_tunnel_inc_refcount_1(_t); \ } while (0) -#define l2tp_tunnel_dec_refcount(_t) +#define l2tp_tunnel_dec_refcount(_t) \ do { \ pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \ __func__, __LINE__, (_t)->name, \ @@ -1358,81 +1359,46 @@ static int l2tp_tunnel_sock_create(struct net *net, { int err = -EINVAL; struct socket *sock = NULL; - struct sockaddr_in udp_addr = {0}; - struct sockaddr_l2tpip ip_addr = {0}; -#if IS_ENABLED(CONFIG_IPV6) - struct sockaddr_in6 udp6_addr = {0}; - struct sockaddr_l2tpip6 ip6_addr = {0}; -#endif + struct udp_port_cfg udp_conf; switch (cfg->encap) { case L2TP_ENCAPTYPE_UDP: + memset(&udp_conf, 0, sizeof(udp_conf)); + #if IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { - err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto out; - - sk_change_net(sock->sk, net); - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, 
cfg->local_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = htons(cfg->local_udp_port); - err = kernel_bind(sock, (struct sockaddr *) &udp6_addr, - sizeof(udp6_addr)); - if (err < 0) - goto out; - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, cfg->peer_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = htons(cfg->peer_udp_port); - err = kernel_connect(sock, - (struct sockaddr *) &udp6_addr, - sizeof(udp6_addr), 0); - if (err < 0) - goto out; - - if (cfg->udp6_zero_tx_checksums) - udp_set_no_check6_tx(sock->sk, true); - if (cfg->udp6_zero_rx_checksums) - udp_set_no_check6_rx(sock->sk, true); + udp_conf.family = AF_INET6; + memcpy(&udp_conf.local_ip6, cfg->local_ip6, + sizeof(udp_conf.local_ip6)); + memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, + sizeof(udp_conf.peer_ip6)); + udp_conf.use_udp6_tx_checksums = + cfg->udp6_zero_tx_checksums; + udp_conf.use_udp6_rx_checksums = + cfg->udp6_zero_rx_checksums; } else #endif { - err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto out; - - sk_change_net(sock->sk, net); - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = htons(cfg->local_udp_port); - err = kernel_bind(sock, (struct sockaddr *) &udp_addr, - sizeof(udp_addr)); - if (err < 0) - goto out; - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = htons(cfg->peer_udp_port); - err = kernel_connect(sock, - (struct sockaddr *) &udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto out; + udp_conf.family = AF_INET; + udp_conf.local_ip = cfg->local_ip; + udp_conf.peer_ip = cfg->peer_ip; + udp_conf.use_udp_checksums = cfg->use_udp_checksums; } - if (!cfg->use_udp_checksums) - sock->sk->sk_no_check_tx = 1; + udp_conf.local_udp_port = htons(cfg->local_udp_port); + udp_conf.peer_udp_port = htons(cfg->peer_udp_port); + + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + goto out; break; case L2TP_ENCAPTYPE_IP: #if 
IS_ENABLED(CONFIG_IPV6) if (cfg->local_ip6 && cfg->peer_ip6) { + struct sockaddr_l2tpip6 ip6_addr = {0}; + err = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) @@ -1461,6 +1427,8 @@ static int l2tp_tunnel_sock_create(struct net *net, } else #endif { + struct sockaddr_l2tpip ip_addr = {0}; + err = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) @@ -1614,19 +1582,17 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ tunnel->encap = encap; if (encap == L2TP_ENCAPTYPE_UDP) { - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ - udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP; - udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv; - udp_sk(sk)->encap_destroy = l2tp_udp_encap_destroy; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == PF_INET6 && !tunnel->v4mapped) - udpv6_encap_enable(); - else -#endif - udp_encap_enable(); - } + struct udp_tunnel_sock_cfg udp_cfg; + + udp_cfg.sk_user_data = tunnel; + udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP; + udp_cfg.encap_rcv = l2tp_udp_encap_recv; + udp_cfg.encap_destroy = l2tp_udp_encap_destroy; - sk->sk_user_data = tunnel; + setup_udp_tunnel_sock(net, sock, &udp_cfg); + } else { + sk->sk_user_data = tunnel; + } /* Hook on the tunnel socket destructor so that we can cleanup * if the tunnel socket goes away. 
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 76125c57ee6d..edb78e69efe4 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -246,7 +246,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p goto out; } - dev = alloc_netdev(sizeof(*priv), name, l2tp_eth_dev_setup); + dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, + l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; goto out_del_session; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f3f98a156cee..0edb263cc002 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -687,7 +687,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, lsa->l2tp_scope_id = 0; lsa->l2tp_conn_id = 0; if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) - lsa->l2tp_scope_id = IP6CB(skb)->iif; + lsa->l2tp_scope_id = inet6_iif(skb); } if (np->rxopt.all) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 13752d96275e..b704a9356208 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -755,7 +755,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* If PMTU discovery was enabled, use the MTU that was discovered */ dst = sk_dst_get(tunnel->sock); if (dst != NULL) { - u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock)); + u32 pmtu = dst_mtu(dst); + if (pmtu != 0) session->mtu = session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8ae..bb9cbc17d926 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - 
sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 97b5dcad5025..aeb6a483b3bc 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -19,14 +19,6 @@ if MAC80211 != n config MAC80211_HAS_RC bool -config MAC80211_RC_PID - bool "PID controller based rate control algorithm" if EXPERT - select MAC80211_HAS_RC - ---help--- - This option enables a TX rate control algorithm for - mac80211 that uses a PID controller to select the TX - rate. - config MAC80211_RC_MINSTREL bool "Minstrel" if EXPERT select MAC80211_HAS_RC @@ -51,14 +43,6 @@ choice overridden through the ieee80211_default_rc_algo module parameter if different algorithms are available. -config MAC80211_RC_DEFAULT_PID - bool "PID controller based rate control algorithm" - depends on MAC80211_RC_PID - ---help--- - Select the PID controller based rate control as the - default rate control algorithm. You should choose - this unless you know what you are doing. 
- config MAC80211_RC_DEFAULT_MINSTREL bool "Minstrel" depends on MAC80211_RC_MINSTREL @@ -72,7 +56,6 @@ config MAC80211_RC_DEFAULT string default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL - default "pid" if MAC80211_RC_DEFAULT_PID default "" endif diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 1e46ffa69167..7273d2796dd1 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -17,6 +17,7 @@ mac80211-y := \ aes_ccm.o \ aes_cmac.o \ cfg.o \ + ethtool.o \ rx.o \ spectmgmt.o \ tx.o \ @@ -47,17 +48,12 @@ mac80211-$(CONFIG_PM) += pm.o CFLAGS_trace.o := -I$(src) -# objects for PID algorithm -rc80211_pid-y := rc80211_pid_algo.o -rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o - rc80211_minstrel-y := rc80211_minstrel.o rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o rc80211_minstrel_ht-y := rc80211_minstrel_ht.o rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o -mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc80211_pid-y) mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 31bf2586fb84..a48bad468880 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -52,7 +52,7 @@ static void ieee80211_free_tid_rx(struct rcu_head *h) del_timer_sync(&tid_rx->reorder_timer); for (i = 0; i < tid_rx->buf_size; i++) - dev_kfree_skb(tid_rx->reorder_buf[i]); + __skb_queue_purge(&tid_rx->reorder_buf[i]); kfree(tid_rx->reorder_buf); kfree(tid_rx->reorder_time); kfree(tid_rx); @@ -224,28 +224,15 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d ieee80211_tx_skb(sdata, skb); } -void ieee80211_process_addba_request(struct ieee80211_local *local, - struct sta_info *sta, - struct ieee80211_mgmt *mgmt, - size_t len) +void 
__ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) { + struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; - u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status; - u8 dialog_token; - int ret = -EOPNOTSUPP; - - /* extract session parameters from addba request frame */ - dialog_token = mgmt->u.action.u.addba_req.dialog_token; - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); - start_seq_num = - le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; - - capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); - ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; - tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; - buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; - - status = WLAN_STATUS_REQUEST_DECLINED; + int i, ret = -EOPNOTSUPP; + u16 status = WLAN_STATUS_REQUEST_DECLINED; if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, @@ -264,7 +251,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. 
policy %d, buffer size %d\n", - mgmt->sa, tid, ba_policy, buf_size); + sta->sta.addr, tid, ba_policy, buf_size); goto end_no_lock; } /* determine default buffer size */ @@ -281,7 +268,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, if (sta->ampdu_mlme.tid_rx[tid]) { ht_dbg_ratelimited(sta->sdata, "unexpected AddBA Req from %pM on tid %u\n", - mgmt->sa, tid); + sta->sta.addr, tid); /* delete existing Rx BA session on the same tid */ ___ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, @@ -308,7 +295,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, /* prepare reordering buffer */ tid_agg_rx->reorder_buf = - kcalloc(buf_size, sizeof(struct sk_buff *), GFP_KERNEL); + kcalloc(buf_size, sizeof(struct sk_buff_head), GFP_KERNEL); tid_agg_rx->reorder_time = kcalloc(buf_size, sizeof(unsigned long), GFP_KERNEL); if (!tid_agg_rx->reorder_buf || !tid_agg_rx->reorder_time) { @@ -318,6 +305,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, goto end; } + for (i = 0; i < buf_size; i++) + __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]); + ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START, &sta->sta, tid, &start_seq_num, 0); ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n", @@ -336,6 +326,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, tid_agg_rx->buf_size = buf_size; tid_agg_rx->timeout = timeout; tid_agg_rx->stored_mpdu_num = 0; + tid_agg_rx->auto_seq = auto_seq; status = WLAN_STATUS_SUCCESS; /* activate it for RX */ @@ -350,6 +341,74 @@ end: mutex_unlock(&sta->ampdu_mlme.mtx); end_no_lock: - ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, - dialog_token, status, 1, buf_size, timeout); + if (tx) + ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, + dialog_token, status, 1, buf_size, + timeout); +} + +void ieee80211_process_addba_request(struct ieee80211_local *local, + struct sta_info *sta, + struct ieee80211_mgmt *mgmt, + 
size_t len) +{ + u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; + u8 dialog_token; + + /* extract session parameters from addba request frame */ + dialog_token = mgmt->u.action.u.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + start_seq_num = + le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + + capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; + tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; + buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + + __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, + start_seq_num, ba_policy, tid, + buf_size, true, false); +} + +void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif, + const u8 *addr, u16 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_rx_agg *rx_agg; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) + return; + + rx_agg = (struct ieee80211_rx_agg *) &skb->cb; + memcpy(&rx_agg->addr, addr, ETH_ALEN); + rx_agg->tid = tid; + + skb->pkt_type = IEEE80211_SDATA_QUEUE_RX_AGG_START; + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&local->hw, &sdata->work); +} +EXPORT_SYMBOL(ieee80211_start_rx_ba_session_offl); + +void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, + const u8 *addr, u16 tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_rx_agg *rx_agg; + struct sk_buff *skb = dev_alloc_skb(0); + + if (unlikely(!skb)) + return; + + rx_agg = (struct ieee80211_rx_agg *) &skb->cb; + memcpy(&rx_agg->addr, addr, ETH_ALEN); + rx_agg->tid = tid; + + skb->pkt_type = IEEE80211_SDATA_QUEUE_RX_AGG_STOP; + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&local->hw, &sdata->work); } +EXPORT_SYMBOL(ieee80211_stop_rx_ba_session_offl); diff 
--git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ce9633a3cfb0..d6986f3aa5c4 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -170,10 +170,13 @@ ieee80211_stop_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) { int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)]; + /* we do refcounting here, so don't use the queue reason refcounting */ + if (atomic_inc_return(&sdata->local->agg_queue_stop[queue]) == 1) ieee80211_stop_queue_by_reason( &sdata->local->hw, queue, - IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + IEEE80211_QUEUE_STOP_REASON_AGGREGATION, + false); __acquire(agg_queue); } @@ -185,7 +188,8 @@ ieee80211_wake_queue_agg(struct ieee80211_sub_if_data *sdata, int tid) if (atomic_dec_return(&sdata->local->agg_queue_stop[queue]) == 0) ieee80211_wake_queue_by_reason( &sdata->local->hw, queue, - IEEE80211_QUEUE_STOP_REASON_AGGREGATION); + IEEE80211_QUEUE_STOP_REASON_AGGREGATION, + false); __release(agg_queue); } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 592f4b152ba8..fb6a1502b6df 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2,6 +2,7 @@ * mac80211 configuration hooks for cfg80211 * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. 
*/ @@ -468,330 +469,6 @@ void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) rinfo->flags |= RATE_INFO_FLAGS_160_MHZ_WIDTH; } -static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - struct ieee80211_local *local = sdata->local; - struct rate_control_ref *ref = NULL; - struct timespec uptime; - u64 packets = 0; - u32 thr = 0; - int i, ac; - - if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - ref = local->rate_ctrl; - - sinfo->generation = sdata->local->sta_generation; - - sinfo->filled = STATION_INFO_INACTIVE_TIME | - STATION_INFO_RX_BYTES64 | - STATION_INFO_TX_BYTES64 | - STATION_INFO_RX_PACKETS | - STATION_INFO_TX_PACKETS | - STATION_INFO_TX_RETRIES | - STATION_INFO_TX_FAILED | - STATION_INFO_TX_BITRATE | - STATION_INFO_RX_BITRATE | - STATION_INFO_RX_DROP_MISC | - STATION_INFO_BSS_PARAM | - STATION_INFO_CONNECTED_TIME | - STATION_INFO_STA_FLAGS | - STATION_INFO_BEACON_LOSS_COUNT; - - do_posix_clock_monotonic_gettime(&uptime); - sinfo->connected_time = uptime.tv_sec - sta->last_connected; - - sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); - sinfo->tx_bytes = 0; - for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { - sinfo->tx_bytes += sta->tx_bytes[ac]; - packets += sta->tx_packets[ac]; - } - sinfo->tx_packets = packets; - sinfo->rx_bytes = sta->rx_bytes; - sinfo->rx_packets = sta->rx_packets; - sinfo->tx_retries = sta->tx_retry_count; - sinfo->tx_failed = sta->tx_retry_failed; - sinfo->rx_dropped_misc = sta->rx_dropped; - sinfo->beacon_loss_count = sta->beacon_loss_count; - - if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || - (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { - sinfo->filled |= STATION_INFO_SIGNAL | STATION_INFO_SIGNAL_AVG; - if (!local->ops->get_rssi || - drv_get_rssi(local, sdata, &sta->sta, &sinfo->signal)) - sinfo->signal = (s8)sta->last_signal; - sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal); - } - if (sta->chains) { 
- sinfo->filled |= STATION_INFO_CHAIN_SIGNAL | - STATION_INFO_CHAIN_SIGNAL_AVG; - - sinfo->chains = sta->chains; - for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { - sinfo->chain_signal[i] = sta->chain_signal_last[i]; - sinfo->chain_signal_avg[i] = - (s8) -ewma_read(&sta->chain_signal_avg[i]); - } - } - - sta_set_rate_info_tx(sta, &sta->last_tx_rate, &sinfo->txrate); - sta_set_rate_info_rx(sta, &sinfo->rxrate); - - if (ieee80211_vif_is_mesh(&sdata->vif)) { -#ifdef CONFIG_MAC80211_MESH - sinfo->filled |= STATION_INFO_LLID | - STATION_INFO_PLID | - STATION_INFO_PLINK_STATE | - STATION_INFO_LOCAL_PM | - STATION_INFO_PEER_PM | - STATION_INFO_NONPEER_PM; - - sinfo->llid = sta->llid; - sinfo->plid = sta->plid; - sinfo->plink_state = sta->plink_state; - if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - sinfo->filled |= STATION_INFO_T_OFFSET; - sinfo->t_offset = sta->t_offset; - } - sinfo->local_pm = sta->local_pm; - sinfo->peer_pm = sta->peer_pm; - sinfo->nonpeer_pm = sta->nonpeer_pm; -#endif - } - - sinfo->bss_param.flags = 0; - if (sdata->vif.bss_conf.use_cts_prot) - sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; - if (sdata->vif.bss_conf.use_short_preamble) - sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; - if (sdata->vif.bss_conf.use_short_slot) - sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; - sinfo->bss_param.dtim_period = sdata->local->hw.conf.ps_dtim_period; - sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; - - sinfo->sta_flags.set = 0; - sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | - BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | - BIT(NL80211_STA_FLAG_WME) | - BIT(NL80211_STA_FLAG_MFP) | - BIT(NL80211_STA_FLAG_AUTHENTICATED) | - BIT(NL80211_STA_FLAG_ASSOCIATED) | - BIT(NL80211_STA_FLAG_TDLS_PEER); - if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); - if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) - sinfo->sta_flags.set |= 
BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); - if (test_sta_flag(sta, WLAN_STA_WME)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); - if (test_sta_flag(sta, WLAN_STA_MFP)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); - if (test_sta_flag(sta, WLAN_STA_AUTH)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); - if (test_sta_flag(sta, WLAN_STA_ASSOC)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) - sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); - - /* check if the driver has a SW RC implementation */ - if (ref && ref->ops->get_expected_throughput) - thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); - else - thr = drv_get_expected_throughput(local, &sta->sta); - - if (thr != 0) { - sinfo->filled |= STATION_INFO_EXPECTED_THROUGHPUT; - sinfo->expected_throughput = thr; - } -} - -static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { - "rx_packets", "rx_bytes", "wep_weak_iv_count", - "rx_duplicates", "rx_fragments", "rx_dropped", - "tx_packets", "tx_bytes", "tx_fragments", - "tx_filtered", "tx_retry_failed", "tx_retries", - "beacon_loss", "sta_state", "txrate", "rxrate", "signal", - "channel", "noise", "ch_time", "ch_time_busy", - "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" -}; -#define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) - -static int ieee80211_get_et_sset_count(struct wiphy *wiphy, - struct net_device *dev, - int sset) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - int rv = 0; - - if (sset == ETH_SS_STATS) - rv += STA_STATS_LEN; - - rv += drv_get_et_sset_count(sdata, sset); - - if (rv == 0) - return -EOPNOTSUPP; - return rv; -} - -static void ieee80211_get_et_stats(struct wiphy *wiphy, - struct net_device *dev, - struct ethtool_stats *stats, - u64 *data) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_channel 
*channel; - struct sta_info *sta; - struct ieee80211_local *local = sdata->local; - struct station_info sinfo; - struct survey_info survey; - int i, q; -#define STA_STATS_SURVEY_LEN 7 - - memset(data, 0, sizeof(u64) * STA_STATS_LEN); - -#define ADD_STA_STATS(sta) \ - do { \ - data[i++] += sta->rx_packets; \ - data[i++] += sta->rx_bytes; \ - data[i++] += sta->wep_weak_iv_count; \ - data[i++] += sta->num_duplicates; \ - data[i++] += sta->rx_fragments; \ - data[i++] += sta->rx_dropped; \ - \ - data[i++] += sinfo.tx_packets; \ - data[i++] += sinfo.tx_bytes; \ - data[i++] += sta->tx_fragments; \ - data[i++] += sta->tx_filtered_count; \ - data[i++] += sta->tx_retry_failed; \ - data[i++] += sta->tx_retry_count; \ - data[i++] += sta->beacon_loss_count; \ - } while (0) - - /* For Managed stations, find the single station based on BSSID - * and use that. For interface types, iterate through all available - * stations and add stats for any station that is assigned to this - * network device. - */ - - mutex_lock(&local->sta_mtx); - - if (sdata->vif.type == NL80211_IFTYPE_STATION) { - sta = sta_info_get_bss(sdata, sdata->u.mgd.bssid); - - if (!(sta && !WARN_ON(sta->sdata->dev != dev))) - goto do_survey; - - sinfo.filled = 0; - sta_set_sinfo(sta, &sinfo); - - i = 0; - ADD_STA_STATS(sta); - - data[i++] = sta->sta_state; - - - if (sinfo.filled & STATION_INFO_TX_BITRATE) - data[i] = 100000 * - cfg80211_calculate_bitrate(&sinfo.txrate); - i++; - if (sinfo.filled & STATION_INFO_RX_BITRATE) - data[i] = 100000 * - cfg80211_calculate_bitrate(&sinfo.rxrate); - i++; - - if (sinfo.filled & STATION_INFO_SIGNAL_AVG) - data[i] = (u8)sinfo.signal_avg; - i++; - } else { - list_for_each_entry(sta, &local->sta_list, list) { - /* Make sure this station belongs to the proper dev */ - if (sta->sdata->dev != dev) - continue; - - sinfo.filled = 0; - sta_set_sinfo(sta, &sinfo); - i = 0; - ADD_STA_STATS(sta); - } - } - -do_survey: - i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; - /* Get survey stats for 
current channel */ - survey.filled = 0; - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (chanctx_conf) - channel = chanctx_conf->def.chan; - else - channel = NULL; - rcu_read_unlock(); - - if (channel) { - q = 0; - do { - survey.filled = 0; - if (drv_get_survey(local, q, &survey) != 0) { - survey.filled = 0; - break; - } - q++; - } while (channel != survey.channel); - } - - if (survey.filled) - data[i++] = survey.channel->center_freq; - else - data[i++] = 0; - if (survey.filled & SURVEY_INFO_NOISE_DBM) - data[i++] = (u8)survey.noise; - else - data[i++] = -1LL; - if (survey.filled & SURVEY_INFO_CHANNEL_TIME) - data[i++] = survey.channel_time; - else - data[i++] = -1LL; - if (survey.filled & SURVEY_INFO_CHANNEL_TIME_BUSY) - data[i++] = survey.channel_time_busy; - else - data[i++] = -1LL; - if (survey.filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY) - data[i++] = survey.channel_time_ext_busy; - else - data[i++] = -1LL; - if (survey.filled & SURVEY_INFO_CHANNEL_TIME_RX) - data[i++] = survey.channel_time_rx; - else - data[i++] = -1LL; - if (survey.filled & SURVEY_INFO_CHANNEL_TIME_TX) - data[i++] = survey.channel_time_tx; - else - data[i++] = -1LL; - - mutex_unlock(&local->sta_mtx); - - if (WARN_ON(i != STA_STATS_LEN)) - return; - - drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); -} - -static void ieee80211_get_et_strings(struct wiphy *wiphy, - struct net_device *dev, - u32 sset, u8 *data) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - int sz_sta_stats = 0; - - if (sset == ETH_SS_STATS) { - sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); - memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); - } - drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); -} - static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { @@ -878,7 +555,8 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, } static int 
ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, - const u8 *resp, size_t resp_len) + const u8 *resp, size_t resp_len, + const struct ieee80211_csa_settings *csa) { struct probe_resp *new, *old; @@ -894,6 +572,11 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, new->len = resp_len; memcpy(new->data, resp, resp_len); + if (csa) + memcpy(new->csa_counter_offsets, csa->counter_offsets_presp, + csa->n_counter_offsets_presp * + sizeof(new->csa_counter_offsets[0])); + rcu_assign_pointer(sdata->u.ap.probe_resp, new); if (old) kfree_rcu(old, rcu_head); @@ -902,7 +585,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, } static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, - struct cfg80211_beacon_data *params) + struct cfg80211_beacon_data *params, + const struct ieee80211_csa_settings *csa) { struct beacon_data *new, *old; int new_head_len, new_tail_len; @@ -946,6 +630,13 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, new->head_len = new_head_len; new->tail_len = new_tail_len; + if (csa) { + new->csa_current_counter = csa->count; + memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon, + csa->n_counter_offsets_beacon * + sizeof(new->csa_counter_offsets[0])); + } + /* copy in head */ if (params->head) memcpy(new->head, params->head, new_head_len); @@ -960,7 +651,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, - params->probe_resp_len); + params->probe_resp_len, csa); if (err < 0) return err; if (err == 0) @@ -992,8 +683,19 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) return -EALREADY; - /* TODO: make hostapd tell us what it wants */ - sdata->smps_mode = IEEE80211_SMPS_OFF; + switch (params->smps_mode) { + case NL80211_SMPS_OFF: + sdata->smps_mode = IEEE80211_SMPS_OFF; + break; + case 
NL80211_SMPS_STATIC: + sdata->smps_mode = IEEE80211_SMPS_STATIC; + break; + case NL80211_SMPS_DYNAMIC: + sdata->smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + default: + return -EINVAL; + } sdata->needed_rx_chains = sdata->local->rx_chains; mutex_lock(&local->mtx); @@ -1045,7 +747,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; - err = ieee80211_assign_beacon(sdata, &params->beacon); + err = ieee80211_assign_beacon(sdata, &params->beacon, NULL); if (err < 0) { ieee80211_vif_release_channel(sdata); return err; @@ -1093,38 +795,13 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, if (!old) return -ENOENT; - err = ieee80211_assign_beacon(sdata, params); + err = ieee80211_assign_beacon(sdata, params, NULL); if (err < 0) return err; ieee80211_bss_info_change_notify(sdata, err); return 0; } -bool ieee80211_csa_needs_block_tx(struct ieee80211_local *local) -{ - struct ieee80211_sub_if_data *sdata; - - lockdep_assert_held(&local->mtx); - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if (!sdata->vif.csa_active) - continue; - - if (!sdata->csa_block_tx) - continue; - - rcu_read_unlock(); - return true; - } - rcu_read_unlock(); - - return false; -} - static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -1144,10 +821,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) /* abort any running channel switch */ mutex_lock(&local->mtx); sdata->vif.csa_active = false; - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + 
sdata->csa_block_tx = false; + } + mutex_unlock(&local->mtx); kfree(sdata->u.ap.next_beacon); @@ -1330,9 +1009,12 @@ static int sta_apply_parameters(struct ieee80211_local *local, } } - ret = sta_apply_auth_flags(local, sta, mask, set); - if (ret) - return ret; + /* auth flags will be set later for TDLS stations */ + if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + ret = sta_apply_auth_flags(local, sta, mask, set); + if (ret) + return ret; + } if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) { if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) @@ -1341,15 +1023,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE); } - if (mask & BIT(NL80211_STA_FLAG_WME)) { - if (set & BIT(NL80211_STA_FLAG_WME)) { - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - } else { - clear_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = false; - } - } + if (mask & BIT(NL80211_STA_FLAG_WME)) + sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); if (mask & BIT(NL80211_STA_FLAG_MFP)) { if (set & BIT(NL80211_STA_FLAG_MFP)) @@ -1469,6 +1144,13 @@ static int sta_apply_parameters(struct ieee80211_local *local, #endif } + /* set the STA state after all sta info from usermode has been set */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + ret = sta_apply_auth_flags(local, sta, mask, set); + if (ret) + return ret; + } + return 0; } @@ -2307,8 +1989,13 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) return err; } - if (changed & WIPHY_PARAM_COVERAGE_CLASS) { - err = drv_set_coverage_class(local, wiphy->coverage_class); + if ((changed & WIPHY_PARAM_COVERAGE_CLASS) || + (changed & WIPHY_PARAM_DYN_ACK)) { + s16 coverage_class; + + coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? 
+ wiphy->coverage_class : -1; + err = drv_set_coverage_class(local, coverage_class); if (err) return err; @@ -2681,6 +2368,58 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return 0; } +static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, + struct ieee80211_roc_work *new_roc, + struct ieee80211_roc_work *cur_roc) +{ + unsigned long j = jiffies; + unsigned long cur_roc_end = cur_roc->hw_start_time + + msecs_to_jiffies(cur_roc->duration); + struct ieee80211_roc_work *next_roc; + int new_dur; + + if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) + return false; + + if (time_after(j + IEEE80211_ROC_MIN_LEFT, cur_roc_end)) + return false; + + ieee80211_handle_roc_started(new_roc); + + new_dur = new_roc->duration - jiffies_to_msecs(cur_roc_end - j); + + /* cur_roc is long enough - add new_roc to the dependents list. */ + if (new_dur <= 0) { + list_add_tail(&new_roc->list, &cur_roc->dependents); + return true; + } + + new_roc->duration = new_dur; + + /* + * if cur_roc was already coalesced before, we might + * want to extend the next roc instead of adding + * a new one. + */ + next_roc = list_entry(cur_roc->list.next, + struct ieee80211_roc_work, list); + if (&next_roc->list != &local->roc_list && + next_roc->chan == new_roc->chan && + next_roc->sdata == new_roc->sdata && + !WARN_ON(next_roc->started)) { + list_add_tail(&new_roc->list, &next_roc->dependents); + next_roc->duration = max(next_roc->duration, + new_roc->duration); + next_roc->type = max(next_roc->type, new_roc->type); + return true; + } + + /* add right after cur_roc */ + list_add(&new_roc->list, &cur_roc->list); + + return true; +} + static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -2786,8 +2525,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, /* If it has already started, it's more difficult ... 
*/ if (local->ops->remain_on_channel) { - unsigned long j = jiffies; - /* * In the offloaded ROC case, if it hasn't begun, add * this new one to the dependent list to be handled @@ -2810,28 +2547,8 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, break; } - if (time_before(j + IEEE80211_ROC_MIN_LEFT, - tmp->hw_start_time + - msecs_to_jiffies(tmp->duration))) { - int new_dur; - - ieee80211_handle_roc_started(roc); - - new_dur = roc->duration - - jiffies_to_msecs(tmp->hw_start_time + - msecs_to_jiffies( - tmp->duration) - - j); - - if (new_dur > 0) { - /* add right after tmp */ - list_add(&roc->list, &tmp->list); - } else { - list_add_tail(&roc->list, - &tmp->dependents); - } + if (ieee80211_coalesce_started_roc(local, roc, tmp)) queued = true; - } } else if (del_timer_sync(&tmp->work.timer)) { unsigned long new_end; @@ -3076,7 +2793,8 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon); + err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, + NULL); kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; @@ -3114,17 +2832,35 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) sdata_assert_lock(sdata); lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); - sdata->radar_required = sdata->csa_radar_required; - err = ieee80211_vif_change_channel(sdata, &changed); - if (err < 0) - return err; + /* + * using reservation isn't immediate as it may be deferred until later + * with multi-vif. 
once reservation is complete it will re-schedule the + * work with no reserved_chanctx so verify chandef to check if it + * completed successfully + */ - if (!local->use_chanctx) { - local->_oper_chandef = sdata->csa_chandef; - ieee80211_hw_config(local, 0); + if (sdata->reserved_chanctx) { + /* + * with multi-vif csa driver may call ieee80211_csa_finish() + * many times while waiting for other interfaces to use their + * reservations + */ + if (sdata->reserved_ready) + return 0; + + err = ieee80211_vif_use_reserved_context(sdata); + if (err) + return err; + + return 0; } + if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef, + &sdata->csa_chandef)) + return -EINVAL; + sdata->vif.csa_active = false; err = ieee80211_set_after_csa_beacon(sdata, &changed); @@ -3134,10 +2870,11 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) ieee80211_bss_info_change_notify(sdata, changed); cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef); - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_block_tx = false; + } return 0; } @@ -3160,6 +2897,7 @@ void ieee80211_csa_finalize_work(struct work_struct *work) sdata_lock(sdata); mutex_lock(&local->mtx); + mutex_lock(&local->chanctx_mtx); /* AP might have been stopped while waiting for the lock. 
*/ if (!sdata->vif.csa_active) @@ -3171,6 +2909,7 @@ void ieee80211_csa_finalize_work(struct work_struct *work) ieee80211_csa_finalize(sdata); unlock: + mutex_unlock(&local->chanctx_mtx); mutex_unlock(&local->mtx); sdata_unlock(sdata); } @@ -3179,6 +2918,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *params, u32 *changed) { + struct ieee80211_csa_settings csa = {}; int err; switch (sdata->vif.type) { @@ -3213,20 +2953,13 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, IEEE80211_MAX_CSA_COUNTERS_NUM)) return -EINVAL; - /* make sure we don't have garbage in other counters */ - memset(sdata->csa_counter_offset_beacon, 0, - sizeof(sdata->csa_counter_offset_beacon)); - memset(sdata->csa_counter_offset_presp, 0, - sizeof(sdata->csa_counter_offset_presp)); + csa.counter_offsets_beacon = params->counter_offsets_beacon; + csa.counter_offsets_presp = params->counter_offsets_presp; + csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon; + csa.n_counter_offsets_presp = params->n_counter_offsets_presp; + csa.count = params->count; - memcpy(sdata->csa_counter_offset_beacon, - params->counter_offsets_beacon, - params->n_counter_offsets_beacon * sizeof(u16)); - memcpy(sdata->csa_counter_offset_presp, - params->counter_offsets_presp, - params->n_counter_offsets_presp * sizeof(u16)); - - err = ieee80211_assign_beacon(sdata, &params->beacon_csa); + err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa); if (err < 0) { kfree(sdata->u.ap.next_beacon); return err; @@ -3322,7 +3055,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; - int err, num_chanctx, changed = 0; + int err, changed = 0; sdata_assert_lock(sdata); lockdep_assert_held(&local->mtx); @@ -3337,46 +3070,50 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, 
&sdata->vif.bss_conf.chandef)) return -EINVAL; + /* don't allow another channel switch if one is already active. */ + if (sdata->vif.csa_active) + return -EBUSY; + mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); if (!conf) { - mutex_unlock(&local->chanctx_mtx); - return -EBUSY; + err = -EBUSY; + goto out; } - /* don't handle for multi-VIF cases */ chanctx = container_of(conf, struct ieee80211_chanctx, conf); - if (ieee80211_chanctx_refcount(local, chanctx) > 1) { - mutex_unlock(&local->chanctx_mtx); - return -EBUSY; + if (!chanctx) { + err = -EBUSY; + goto out; } - num_chanctx = 0; - list_for_each_entry_rcu(chanctx, &local->chanctx_list, list) - num_chanctx++; - mutex_unlock(&local->chanctx_mtx); - if (num_chanctx > 1) - return -EBUSY; + err = ieee80211_vif_reserve_chanctx(sdata, &params->chandef, + chanctx->mode, + params->radar_required); + if (err) + goto out; - /* don't allow another channel switch if one is already active. 
*/ - if (sdata->vif.csa_active) - return -EBUSY; + /* if reservation is invalid then this will fail */ + err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0); + if (err) { + ieee80211_vif_unreserve_chanctx(sdata); + goto out; + } err = ieee80211_set_csa_beacon(sdata, params, &changed); - if (err) - return err; + if (err) { + ieee80211_vif_unreserve_chanctx(sdata); + goto out; + } - sdata->csa_radar_required = params->radar_required; sdata->csa_chandef = params->chandef; sdata->csa_block_tx = params->block_tx; - sdata->csa_current_counter = params->count; sdata->vif.csa_active = true; if (sdata->csa_block_tx) - ieee80211_stop_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + ieee80211_stop_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); if (changed) { ieee80211_bss_info_change_notify(sdata, changed); @@ -3386,7 +3123,9 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, ieee80211_csa_finalize(sdata); } - return 0; +out: + mutex_unlock(&local->chanctx_mtx); + return err; } int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, @@ -3518,10 +3257,23 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; - u8 c = sdata->csa_current_counter; + struct beacon_data *beacon = NULL; + + rcu_read_lock(); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + beacon = rcu_dereference(sdata->u.ap.beacon); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + beacon = rcu_dereference(sdata->u.ibss.presp); + else if (ieee80211_vif_is_mesh(&sdata->vif)) + beacon = rcu_dereference(sdata->u.mesh.beacon); - for (i = 0; i < params->n_csa_offsets; i++) - data[params->csa_offsets[i]] = c; + if (beacon) + for (i = 0; i < params->n_csa_offsets; i++) + data[params->csa_offsets[i]] = + beacon->csa_current_counter; + + rcu_read_unlock(); } IEEE80211_SKB_CB(skb)->flags = flags; @@ -3601,21 
+3353,6 @@ static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) return drv_get_antenna(local, tx_ant, rx_ant); } -static int ieee80211_set_ringparam(struct wiphy *wiphy, u32 tx, u32 rx) -{ - struct ieee80211_local *local = wiphy_priv(wiphy); - - return drv_set_ringparam(local, tx, rx); -} - -static void ieee80211_get_ringparam(struct wiphy *wiphy, - u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max) -{ - struct ieee80211_local *local = wiphy_priv(wiphy); - - drv_get_ringparam(local, tx, tx_max, rx, rx_max); -} - static int ieee80211_set_rekey_data(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) @@ -3655,7 +3392,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, band = chanctx_conf->def.chan->band; sta = sta_info_get_bss(sdata, peer); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } else { rcu_read_unlock(); return -ENOLINK; @@ -3847,8 +3584,6 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_frame_register = ieee80211_mgmt_frame_register, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, - .set_ringparam = ieee80211_set_ringparam, - .get_ringparam = ieee80211_get_ringparam, .set_rekey_data = ieee80211_set_rekey_data, .tdls_oper = ieee80211_tdls_oper, .tdls_mgmt = ieee80211_tdls_mgmt, @@ -3857,9 +3592,6 @@ const struct cfg80211_ops mac80211_config_ops = { #ifdef CONFIG_PM .set_wakeup = ieee80211_set_wakeup, #endif - .get_et_sset_count = ieee80211_get_et_sset_count, - .get_et_stats = ieee80211_get_et_stats, - .get_et_strings = ieee80211_get_et_strings, .get_channel = ieee80211_cfg_get_channel, .start_radar_detection = ieee80211_start_radar_detection, .channel_switch = ieee80211_channel_switch, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a310e33972de..4c74e8da64b9 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -63,6 +63,20 @@ static bool ieee80211_can_create_new_chanctx(struct 
ieee80211_local *local) return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local); } +static struct ieee80211_chanctx * +ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local __maybe_unused = sdata->local; + struct ieee80211_chanctx_conf *conf; + + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) + return NULL; + + return container_of(conf, struct ieee80211_chanctx, conf); +} + static const struct cfg80211_chan_def * ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, @@ -160,6 +174,9 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local, return NULL; list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) + continue; + if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; @@ -347,6 +364,9 @@ ieee80211_find_chanctx(struct ieee80211_local *local, list_for_each_entry(ctx, &local->chanctx_list, list) { const struct cfg80211_chan_def *compat; + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) + continue; + if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; @@ -521,18 +541,20 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, continue; if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf) continue; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; if (!compat) compat = &sdata->vif.bss_conf.chandef; compat = cfg80211_chandef_compatible( &sdata->vif.bss_conf.chandef, compat); - if (!compat) + if (WARN_ON_ONCE(!compat)) break; } rcu_read_unlock(); - if (WARN_ON_ONCE(!compat)) + if (!compat) return; ieee80211_change_chanctx(local, ctx, compat); @@ -617,29 +639,6 @@ out: return ret; } -static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *conf; - struct 
ieee80211_chanctx *ctx; - - lockdep_assert_held(&local->chanctx_mtx); - - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (!conf) - return; - - ctx = container_of(conf, struct ieee80211_chanctx, conf); - - if (sdata->reserved_chanctx) - ieee80211_vif_unreserve_chanctx(sdata); - - ieee80211_assign_vif_chanctx(sdata, NULL); - if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); -} - void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { @@ -730,127 +729,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RX_CHAINS); } -int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx *ctx; - u8 radar_detect_width = 0; - int ret; - - lockdep_assert_held(&local->mtx); - - WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); - - mutex_lock(&local->chanctx_mtx); - - ret = cfg80211_chandef_dfs_required(local->hw.wiphy, - chandef, - sdata->wdev.iftype); - if (ret < 0) - goto out; - if (ret > 0) - radar_detect_width = BIT(chandef->width); - - sdata->radar_required = ret; - - ret = ieee80211_check_combinations(sdata, chandef, mode, - radar_detect_width); - if (ret < 0) - goto out; - - __ieee80211_vif_release_channel(sdata); - - ctx = ieee80211_find_chanctx(local, chandef, mode); - if (!ctx) - ctx = ieee80211_new_chanctx(local, chandef, mode); - if (IS_ERR(ctx)) { - ret = PTR_ERR(ctx); - goto out; - } - - sdata->vif.bss_conf.chandef = *chandef; - - ret = ieee80211_assign_vif_chanctx(sdata, ctx); - if (ret) { - /* if assign fails refcount stays the same */ - if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); - goto out; - } - - ieee80211_recalc_smps_chanctx(local, ctx); - 
ieee80211_recalc_radar_chanctx(local, ctx); - out: - mutex_unlock(&local->chanctx_mtx); - return ret; -} - -static int __ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, - struct ieee80211_chanctx *ctx, - u32 *changed) -{ - struct ieee80211_local *local = sdata->local; - const struct cfg80211_chan_def *chandef = &sdata->csa_chandef; - u32 chanctx_changed = 0; - - if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, - IEEE80211_CHAN_DISABLED)) - return -EINVAL; - - if (ieee80211_chanctx_refcount(local, ctx) != 1) - return -EINVAL; - - if (sdata->vif.bss_conf.chandef.width != chandef->width) { - chanctx_changed = IEEE80211_CHANCTX_CHANGE_WIDTH; - *changed |= BSS_CHANGED_BANDWIDTH; - } - - sdata->vif.bss_conf.chandef = *chandef; - ctx->conf.def = *chandef; - - chanctx_changed |= IEEE80211_CHANCTX_CHANGE_CHANNEL; - drv_change_chanctx(local, ctx, chanctx_changed); - - ieee80211_recalc_chanctx_chantype(local, ctx); - ieee80211_recalc_smps_chanctx(local, ctx); - ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx); - - return 0; -} - -int ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, - u32 *changed) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *conf; - struct ieee80211_chanctx *ctx; - int ret; - - lockdep_assert_held(&local->mtx); - - /* should never be called if not performing a channel switch. 
*/ - if (WARN_ON(!sdata->vif.csa_active)) - return -EINVAL; - - mutex_lock(&local->chanctx_mtx); - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (!conf) { - ret = -EINVAL; - goto out; - } - - ctx = container_of(conf, struct ieee80211_chanctx, conf); - - ret = __ieee80211_vif_change_channel(sdata, ctx, changed); - out: - mutex_unlock(&local->chanctx_mtx); - return ret; -} - static void __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, bool clear) @@ -905,8 +783,25 @@ int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata) list_del(&sdata->reserved_chanctx_list); sdata->reserved_chanctx = NULL; - if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) - ieee80211_free_chanctx(sdata->local, ctx); + if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { + if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { + if (WARN_ON(!ctx->replace_ctx)) + return -EINVAL; + + WARN_ON(ctx->replace_ctx->replace_state != + IEEE80211_CHANCTX_WILL_BE_REPLACED); + WARN_ON(ctx->replace_ctx->replace_ctx != ctx); + + ctx->replace_ctx->replace_ctx = NULL; + ctx->replace_ctx->replace_state = + IEEE80211_CHANCTX_REPLACE_NONE; + + list_del_rcu(&ctx->list); + kfree_rcu(ctx, rcu_head); + } else { + ieee80211_free_chanctx(sdata->local, ctx); + } + } return 0; } @@ -917,40 +812,84 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, bool radar_required) { struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *conf; - struct ieee80211_chanctx *new_ctx, *curr_ctx; - int ret = 0; - - mutex_lock(&local->chanctx_mtx); + struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx; - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (!conf) { - ret = -EINVAL; - goto out; - } + lockdep_assert_held(&local->chanctx_mtx); - curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); + curr_ctx = 
ieee80211_vif_get_chanctx(sdata); + if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx) + return -ENOTSUPP; new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); if (!new_ctx) { - if (ieee80211_chanctx_refcount(local, curr_ctx) == 1 && - (local->hw.flags & IEEE80211_HW_CHANGE_RUNNING_CHANCTX)) { - /* if we're the only users of the chanctx and - * the driver supports changing a running - * context, reserve our current context - */ - new_ctx = curr_ctx; - } else if (ieee80211_can_create_new_chanctx(local)) { - /* create a new context and reserve it */ + if (ieee80211_can_create_new_chanctx(local)) { new_ctx = ieee80211_new_chanctx(local, chandef, mode); - if (IS_ERR(new_ctx)) { - ret = PTR_ERR(new_ctx); - goto out; - } + if (IS_ERR(new_ctx)) + return PTR_ERR(new_ctx); } else { - ret = -EBUSY; - goto out; + if (!curr_ctx || + (curr_ctx->replace_state == + IEEE80211_CHANCTX_WILL_BE_REPLACED) || + !list_empty(&curr_ctx->reserved_vifs)) { + /* + * Another vif already requested this context + * for a reservation. Find another one hoping + * all vifs assigned to it will also switch + * soon enough. + * + * TODO: This needs a little more work as some + * cases (more than 2 chanctx capable devices) + * may fail which could otherwise succeed + * provided some channel context juggling was + * performed. + * + * Consider ctx1..3, vif1..6, each ctx has 2 + * vifs. vif1 and vif2 from ctx1 request new + * different chandefs starting 2 in-place + * reserations with ctx4 and ctx5 replacing + * ctx1 and ctx2 respectively. Next vif5 and + * vif6 from ctx3 reserve ctx4. If vif3 and + * vif4 remain on ctx2 as they are then this + * fails unless `replace_ctx` from ctx5 is + * replaced with ctx3. 
+ */ + list_for_each_entry(ctx, &local->chanctx_list, + list) { + if (ctx->replace_state != + IEEE80211_CHANCTX_REPLACE_NONE) + continue; + + if (!list_empty(&ctx->reserved_vifs)) + continue; + + curr_ctx = ctx; + break; + } + } + + /* + * If that's true then all available contexts already + * have reservations and cannot be used. + */ + if (!curr_ctx || + (curr_ctx->replace_state == + IEEE80211_CHANCTX_WILL_BE_REPLACED) || + !list_empty(&curr_ctx->reserved_vifs)) + return -EBUSY; + + new_ctx = ieee80211_alloc_chanctx(local, chandef, mode); + if (!new_ctx) + return -ENOMEM; + + new_ctx->replace_ctx = curr_ctx; + new_ctx->replace_state = + IEEE80211_CHANCTX_REPLACES_OTHER; + + curr_ctx->replace_ctx = new_ctx; + curr_ctx->replace_state = + IEEE80211_CHANCTX_WILL_BE_REPLACED; + + list_add_rcu(&new_ctx->list, &local->chanctx_list); } } @@ -958,84 +897,694 @@ int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, sdata->reserved_chanctx = new_ctx; sdata->reserved_chandef = *chandef; sdata->reserved_radar_required = radar_required; -out: - mutex_unlock(&local->chanctx_mtx); - return ret; + sdata->reserved_ready = false; + + return 0; } -int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata, - u32 *changed) +static void +ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx *ctx; - struct ieee80211_chanctx *old_ctx; - struct ieee80211_chanctx_conf *conf; - int ret; - u32 tmp_changed = *changed; + switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_MESH_POINT: + ieee80211_queue_work(&sdata->local->hw, + &sdata->csa_finalize_work); + break; + case NL80211_IFTYPE_STATION: + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.chswitch_work); + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_MONITOR: + case 
NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_P2P_DEVICE: + case NUM_NL80211_IFTYPES: + WARN_ON(1); + break; + } +} - /* TODO: need to recheck if the chandef is usable etc.? */ +static int +ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_vif_chanctx_switch vif_chsw[1] = {}; + struct ieee80211_chanctx *old_ctx, *new_ctx; + const struct cfg80211_chan_def *chandef; + u32 changed = 0; + int err; lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); - mutex_lock(&local->chanctx_mtx); + new_ctx = sdata->reserved_chanctx; + old_ctx = ieee80211_vif_get_chanctx(sdata); - ctx = sdata->reserved_chanctx; - if (WARN_ON(!ctx)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON(!sdata->reserved_ready)) + return -EBUSY; + + if (WARN_ON(!new_ctx)) + return -EINVAL; + + if (WARN_ON(!old_ctx)) + return -EINVAL; + + if (WARN_ON(new_ctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER)) + return -EINVAL; + + chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, + &sdata->reserved_chandef); + if (WARN_ON(!chandef)) + return -EINVAL; + + vif_chsw[0].vif = &sdata->vif; + vif_chsw[0].old_ctx = &old_ctx->conf; + vif_chsw[0].new_ctx = &new_ctx->conf; + + list_del(&sdata->reserved_chanctx_list); + sdata->reserved_chanctx = NULL; + + err = drv_switch_vif_chanctx(local, vif_chsw, 1, + CHANCTX_SWMODE_REASSIGN_VIF); + if (err) { + if (ieee80211_chanctx_refcount(local, new_ctx) == 0) + ieee80211_free_chanctx(local, new_ctx); - conf = rcu_dereference_protected(sdata->vif.chanctx_conf, - lockdep_is_held(&local->chanctx_mtx)); - if (!conf) { - ret = -EINVAL; goto out; } - old_ctx = container_of(conf, struct ieee80211_chanctx, conf); + list_move(&sdata->assigned_chanctx_list, &new_ctx->assigned_vifs); + rcu_assign_pointer(sdata->vif.chanctx_conf, &new_ctx->conf); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + 
__ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + + if (ieee80211_chanctx_refcount(local, old_ctx) == 0) + ieee80211_free_chanctx(local, old_ctx); if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width) - tmp_changed |= BSS_CHANGED_BANDWIDTH; + changed = BSS_CHANGED_BANDWIDTH; sdata->vif.bss_conf.chandef = sdata->reserved_chandef; - /* unref our reservation */ - sdata->reserved_chanctx = NULL; - sdata->radar_required = sdata->reserved_radar_required; + if (changed) + ieee80211_bss_info_change_notify(sdata, changed); + +out: + ieee80211_vif_chanctx_reservation_complete(sdata); + return err; +} + +static int +ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx *old_ctx, *new_ctx; + const struct cfg80211_chan_def *chandef; + int err; + + old_ctx = ieee80211_vif_get_chanctx(sdata); + new_ctx = sdata->reserved_chanctx; + + if (WARN_ON(!sdata->reserved_ready)) + return -EINVAL; + + if (WARN_ON(old_ctx)) + return -EINVAL; + + if (WARN_ON(!new_ctx)) + return -EINVAL; + + if (WARN_ON(new_ctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER)) + return -EINVAL; + + chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, + &sdata->reserved_chandef); + if (WARN_ON(!chandef)) + return -EINVAL; + list_del(&sdata->reserved_chanctx_list); + sdata->reserved_chanctx = NULL; - if (old_ctx == ctx) { - /* This is our own context, just change it */ - ret = __ieee80211_vif_change_channel(sdata, old_ctx, - &tmp_changed); - if (ret) + err = ieee80211_assign_vif_chanctx(sdata, new_ctx); + if (err) { + if (ieee80211_chanctx_refcount(local, new_ctx) == 0) + ieee80211_free_chanctx(local, new_ctx); + + goto out; + } + +out: + ieee80211_vif_chanctx_reservation_complete(sdata); + return err; +} + +static bool +ieee80211_vif_has_in_place_reservation(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_chanctx *old_ctx, *new_ctx; + + 
lockdep_assert_held(&sdata->local->chanctx_mtx); + + new_ctx = sdata->reserved_chanctx; + old_ctx = ieee80211_vif_get_chanctx(sdata); + + if (!old_ctx) + return false; + + if (WARN_ON(!new_ctx)) + return false; + + if (old_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) + return false; + + if (new_ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + return false; + + return true; +} + +static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local, + struct ieee80211_chanctx *new_ctx) +{ + const struct cfg80211_chan_def *chandef; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL); + if (WARN_ON(!chandef)) + return -EINVAL; + + local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled; + local->_oper_chandef = *chandef; + ieee80211_hw_config(local, 0); + + return 0; +} + +static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, + int n_vifs) +{ + struct ieee80211_vif_chanctx_switch *vif_chsw; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_chanctx *ctx, *old_ctx; + int i, err; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + vif_chsw = kzalloc(sizeof(vif_chsw[0]) * n_vifs, GFP_KERNEL); + if (!vif_chsw) + return -ENOMEM; + + i = 0; + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + if (WARN_ON(!ctx->replace_ctx)) { + err = -EINVAL; goto out; + } + + list_for_each_entry(sdata, &ctx->reserved_vifs, + reserved_chanctx_list) { + if (!ieee80211_vif_has_in_place_reservation( + sdata)) + continue; + + old_ctx = ieee80211_vif_get_chanctx(sdata); + vif_chsw[i].vif = &sdata->vif; + vif_chsw[i].old_ctx = &old_ctx->conf; + vif_chsw[i].new_ctx = &ctx->conf; + + i++; + } + } + + err = drv_switch_vif_chanctx(local, vif_chsw, n_vifs, + CHANCTX_SWMODE_SWAP_CONTEXTS); + +out: + kfree(vif_chsw); + return err; +} + +static 
int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) +{ + struct ieee80211_chanctx *ctx; + int err; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + if (!list_empty(&ctx->replace_ctx->assigned_vifs)) + continue; + + ieee80211_del_chanctx(local, ctx->replace_ctx); + err = ieee80211_add_chanctx(local, ctx); + if (err) + goto err; + } + + return 0; + +err: + WARN_ON(ieee80211_add_chanctx(local, ctx)); + list_for_each_entry_continue_reverse(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + if (!list_empty(&ctx->replace_ctx->assigned_vifs)) + continue; + + ieee80211_del_chanctx(local, ctx); + WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx)); + } + + return err; +} + +static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) +{ + struct ieee80211_sub_if_data *sdata, *sdata_tmp; + struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; + struct ieee80211_chanctx *new_ctx = NULL; + int i, err, n_assigned, n_reserved, n_ready; + int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + /* + * If there are 2 independent pairs of channel contexts performing + * cross-switch of their vifs this code will still wait until both are + * ready even though it could be possible to switch one before the + * other is ready. + * + * For practical reasons and code simplicity just do a single huge + * switch. + */ + + /* + * Verify if the reservation is still feasible. 
+ * - if it's not then disconnect + * - if it is but not all vifs necessary are ready then defer + */ + + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + if (WARN_ON(!ctx->replace_ctx)) { + err = -EINVAL; + goto err; + } + + if (!local->use_chanctx) + new_ctx = ctx; + + n_ctx++; + + n_assigned = 0; + n_reserved = 0; + n_ready = 0; + + list_for_each_entry(sdata, &ctx->replace_ctx->assigned_vifs, + assigned_chanctx_list) { + n_assigned++; + if (sdata->reserved_chanctx) { + n_reserved++; + if (sdata->reserved_ready) + n_ready++; + } + } + + if (n_assigned != n_reserved) { + if (n_ready == n_reserved) { + wiphy_info(local->hw.wiphy, + "channel context reservation cannot be finalized because some interfaces aren't switching\n"); + err = -EBUSY; + goto err; + } + + return -EAGAIN; + } + + ctx->conf.radar_enabled = false; + list_for_each_entry(sdata, &ctx->reserved_vifs, + reserved_chanctx_list) { + if (ieee80211_vif_has_in_place_reservation(sdata) && + !sdata->reserved_ready) + return -EAGAIN; + + old_ctx = ieee80211_vif_get_chanctx(sdata); + if (old_ctx) { + if (old_ctx->replace_state == + IEEE80211_CHANCTX_WILL_BE_REPLACED) + n_vifs_switch++; + else + n_vifs_assign++; + } else { + n_vifs_ctxless++; + } + + if (sdata->reserved_radar_required) + ctx->conf.radar_enabled = true; + } + } + + if (WARN_ON(n_ctx == 0) || + WARN_ON(n_vifs_switch == 0 && + n_vifs_assign == 0 && + n_vifs_ctxless == 0) || + WARN_ON(n_ctx > 1 && !local->use_chanctx) || + WARN_ON(!new_ctx && !local->use_chanctx)) { + err = -EINVAL; + goto err; + } + + /* + * All necessary vifs are ready. Perform the switch now depending on + * reservations and driver capabilities. 
+ */ + + if (local->use_chanctx) { + if (n_vifs_switch > 0) { + err = ieee80211_chsw_switch_vifs(local, n_vifs_switch); + if (err) + goto err; + } + + if (n_vifs_assign > 0 || n_vifs_ctxless > 0) { + err = ieee80211_chsw_switch_ctxs(local); + if (err) + goto err; + } } else { - ret = ieee80211_assign_vif_chanctx(sdata, ctx); - if (ieee80211_chanctx_refcount(local, old_ctx) == 0) - ieee80211_free_chanctx(local, old_ctx); - if (ret) { - /* if assign fails refcount stays the same */ - if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); - goto out; + err = ieee80211_chsw_switch_hwconf(local, new_ctx); + if (err) + goto err; + } + + /* + * Update all structures, values and pointers to point to new channel + * context(s). + */ + + i = 0; + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + if (WARN_ON(!ctx->replace_ctx)) { + err = -EINVAL; + goto err; + } + + list_for_each_entry(sdata, &ctx->reserved_vifs, + reserved_chanctx_list) { + u32 changed = 0; + + if (!ieee80211_vif_has_in_place_reservation(sdata)) + continue; + + rcu_assign_pointer(sdata->vif.chanctx_conf, &ctx->conf); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + __ieee80211_vif_copy_chanctx_to_vlans(sdata, + false); + + sdata->radar_required = sdata->reserved_radar_required; + + if (sdata->vif.bss_conf.chandef.width != + sdata->reserved_chandef.width) + changed = BSS_CHANGED_BANDWIDTH; + + sdata->vif.bss_conf.chandef = sdata->reserved_chandef; + if (changed) + ieee80211_bss_info_change_notify(sdata, + changed); + + ieee80211_recalc_txpower(sdata); + } + + ieee80211_recalc_chanctx_chantype(local, ctx); + ieee80211_recalc_smps_chanctx(local, ctx); + ieee80211_recalc_radar_chanctx(local, ctx); + ieee80211_recalc_chanctx_min_def(local, ctx); + + list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, + reserved_chanctx_list) { + if (ieee80211_vif_get_chanctx(sdata) != ctx) + continue; + + 
list_del(&sdata->reserved_chanctx_list); + list_move(&sdata->assigned_chanctx_list, + &ctx->assigned_vifs); + sdata->reserved_chanctx = NULL; + + ieee80211_vif_chanctx_reservation_complete(sdata); } - if (sdata->vif.type == NL80211_IFTYPE_AP) - __ieee80211_vif_copy_chanctx_to_vlans(sdata, false); + /* + * This context might have been a dependency for an already + * ready re-assign reservation interface that was deferred. Do + * not propagate error to the caller though. The in-place + * reservation for originally requested interface has already + * succeeded at this point. + */ + list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, + reserved_chanctx_list) { + if (WARN_ON(ieee80211_vif_has_in_place_reservation( + sdata))) + continue; + + if (WARN_ON(sdata->reserved_chanctx != ctx)) + continue; + + if (!sdata->reserved_ready) + continue; + + if (ieee80211_vif_get_chanctx(sdata)) + err = ieee80211_vif_use_reserved_reassign( + sdata); + else + err = ieee80211_vif_use_reserved_assign(sdata); + + if (err) { + sdata_info(sdata, + "failed to finalize (re-)assign reservation (err=%d)\n", + err); + ieee80211_vif_unreserve_chanctx(sdata); + cfg80211_stop_iface(local->hw.wiphy, + &sdata->wdev, + GFP_KERNEL); + } + } } - *changed = tmp_changed; + /* + * Finally free old contexts + */ + + list_for_each_entry_safe(ctx, ctx_tmp, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) + continue; + + ctx->replace_ctx->replace_ctx = NULL; + ctx->replace_ctx->replace_state = + IEEE80211_CHANCTX_REPLACE_NONE; + + list_del_rcu(&ctx->list); + kfree_rcu(ctx, rcu_head); + } + + return 0; + +err: + list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + continue; + + list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs, + reserved_chanctx_list) { + ieee80211_vif_unreserve_chanctx(sdata); + ieee80211_vif_chanctx_reservation_complete(sdata); + } + } + + return err; +} + 
+static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + bool use_reserved_switch = false; + + lockdep_assert_held(&local->chanctx_mtx); + + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (!conf) + return; + + ctx = container_of(conf, struct ieee80211_chanctx, conf); + + if (sdata->reserved_chanctx) { + if (sdata->reserved_chanctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER && + ieee80211_chanctx_num_reserved(local, + sdata->reserved_chanctx) > 1) + use_reserved_switch = true; + + ieee80211_vif_unreserve_chanctx(sdata); + } + + ieee80211_assign_vif_chanctx(sdata, NULL); + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); + + /* Unreserving may ready an in-place reservation. */ + if (use_reserved_switch) + ieee80211_vif_use_reserved_switch(local); +} + +int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + enum ieee80211_chanctx_mode mode) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx *ctx; + u8 radar_detect_width = 0; + int ret; + + lockdep_assert_held(&local->mtx); + + WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev)); + + mutex_lock(&local->chanctx_mtx); + + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, + chandef, + sdata->wdev.iftype); + if (ret < 0) + goto out; + if (ret > 0) + radar_detect_width = BIT(chandef->width); + + sdata->radar_required = ret; + + ret = ieee80211_check_combinations(sdata, chandef, mode, + radar_detect_width); + if (ret < 0) + goto out; + + __ieee80211_vif_release_channel(sdata); + + ctx = ieee80211_find_chanctx(local, chandef, mode); + if (!ctx) + ctx = ieee80211_new_chanctx(local, chandef, mode); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto out; + } + + sdata->vif.bss_conf.chandef = 
*chandef; + + ret = ieee80211_assign_vif_chanctx(sdata, ctx); + if (ret) { + /* if assign fails refcount stays the same */ + if (ieee80211_chanctx_refcount(local, ctx) == 0) + ieee80211_free_chanctx(local, ctx); + goto out; + } - ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx); -out: + out: mutex_unlock(&local->chanctx_mtx); return ret; } +int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx *new_ctx; + struct ieee80211_chanctx *old_ctx; + int err; + + lockdep_assert_held(&local->mtx); + lockdep_assert_held(&local->chanctx_mtx); + + new_ctx = sdata->reserved_chanctx; + old_ctx = ieee80211_vif_get_chanctx(sdata); + + if (WARN_ON(!new_ctx)) + return -EINVAL; + + if (WARN_ON(new_ctx->replace_state == + IEEE80211_CHANCTX_WILL_BE_REPLACED)) + return -EINVAL; + + if (WARN_ON(sdata->reserved_ready)) + return -EINVAL; + + sdata->reserved_ready = true; + + if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) { + if (old_ctx) + err = ieee80211_vif_use_reserved_reassign(sdata); + else + err = ieee80211_vif_use_reserved_assign(sdata); + + if (err) + return err; + } + + /* + * In-place reservation may need to be finalized now either if: + * a) sdata is taking part in the swapping itself and is the last one + * b) sdata has switched with a re-assign reservation to an existing + * context readying in-place switching of old_ctx + * + * In case of (b) do not propagate the error up because the requested + * sdata already switched successfully. Just spill an extra warning. + * The ieee80211_vif_use_reserved_switch() already stops all necessary + * interfaces upon failure. 
+ */ + if ((old_ctx && + old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) || + new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { + err = ieee80211_vif_use_reserved_switch(local); + if (err && err != -EAGAIN) { + if (new_ctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER) + return err; + + wiphy_info(local->hw.wiphy, + "depending in-place reservation failed (err=%d)\n", + err); + } + } + + return 0; +} + int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed) @@ -1043,6 +1592,7 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; + const struct cfg80211_chan_def *compat; int ret; if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, @@ -1069,11 +1619,33 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, } ctx = container_of(conf, struct ieee80211_chanctx, conf); - if (!cfg80211_chandef_compatible(&conf->def, chandef)) { + + compat = cfg80211_chandef_compatible(&conf->def, chandef); + if (!compat) { ret = -EINVAL; goto out; } + switch (ctx->replace_state) { + case IEEE80211_CHANCTX_REPLACE_NONE: + if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) { + ret = -EBUSY; + goto out; + } + break; + case IEEE80211_CHANCTX_WILL_BE_REPLACED: + /* TODO: Perhaps the bandwith change could be treated as a + * reservation itself? 
*/ + ret = -EBUSY; + goto out; + case IEEE80211_CHANCTX_REPLACES_OTHER: + /* channel context that is going to replace another channel + * context doesn't really exist and shouldn't be assigned + * anywhere yet */ + WARN_ON(1); + break; + } + sdata->vif.bss_conf.chandef = *chandef; ieee80211_recalc_chanctx_chantype(local, ctx); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 0e963bc1ceac..54a189f0393e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -3,6 +3,7 @@ * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * GPLv2 * @@ -302,11 +303,6 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n"); if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE) sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_STATIC_SMPS\n"); - if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) - sf += scnprintf(buf + sf, mxln - sf, - "SUPPORTS_DYNAMIC_SMPS\n"); if (local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_UAPSD\n"); if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index e205ebabfa50..c68896adfa96 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -226,12 +226,12 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int err; - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS) && + if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) && smps_mode == IEEE80211_SMPS_STATIC) return -EINVAL; /* auto should be dynamic if in PS mode */ - if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) && + if 
(!(local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) && (smps_mode == IEEE80211_SMPS_DYNAMIC || smps_mode == IEEE80211_SMPS_AUTOMATIC)) return -EINVAL; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 2ecb4deddb5d..bafe48916229 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -2,6 +2,7 @@ * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -77,7 +78,8 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), - TEST(WME), TEST(WDS), TEST(CLEAR_PS_FILT), + sta->sta.wme ? "WME\n" : "", + TEST(WDS), TEST(CLEAR_PS_FILT), TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), TEST(TDLS_PEER_AUTH), TEST(4ADDR_EVENT), @@ -124,7 +126,7 @@ static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf, long connected_time_secs; char buf[100]; int res; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); connected_time_secs = uptime.tv_sec - sta->last_connected; time_to_tm(connected_time_secs, 0, &result); result.tm_year -= 70; @@ -167,7 +169,7 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n", sta->ampdu_mlme.dialog_token_allocator + 1); p += scnprintf(p, sizeof(buf) + buf - p, - "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); + "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]); @@ -587,7 +589,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) 
DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count); DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed); DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count); - DEBUGFS_ADD_COUNTER(wep_weak_iv_count, wep_weak_iv_count); if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index bd782dcffcc7..196d48c68134 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -314,7 +314,7 @@ static inline void drv_update_tkip_key(struct ieee80211_local *local, static inline int drv_hw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - struct cfg80211_scan_request *req) + struct ieee80211_scan_request *req) { int ret; @@ -346,7 +346,7 @@ static inline int drv_sched_scan_start(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req, - struct ieee80211_sched_scan_ies *ies) + struct ieee80211_scan_ies *ies) { int ret; @@ -450,7 +450,7 @@ static inline int drv_set_rts_threshold(struct ieee80211_local *local, } static inline int drv_set_coverage_class(struct ieee80211_local *local, - u8 value) + s16 value) { int ret = 0; might_sleep(); @@ -970,6 +970,22 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void +drv_mgd_protect_tdls_discover(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); + + trace_drv_mgd_protect_tdls_discover(local, sdata); + if (local->ops->mgd_protect_tdls_discover) + local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + static inline int drv_add_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c new file mode 100644 
index 000000000000..ebfc8091557b --- /dev/null +++ b/net/mac80211/ethtool.c @@ -0,0 +1,244 @@ +/* + * mac80211 ethtool hooks for cfg80211 + * + * Copied from cfg.c - originally + * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2014 Intel Corporation (Author: Johannes Berg) + * + * This file is GPLv2 as found in COPYING. + */ +#include <linux/types.h> +#include <net/cfg80211.h> +#include "ieee80211_i.h" +#include "sta_info.h" +#include "driver-ops.h" + +static int ieee80211_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *rp) +{ + struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); + + if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) + return -EINVAL; + + return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); +} + +static void ieee80211_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *rp) +{ + struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); + + memset(rp, 0, sizeof(*rp)); + + drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, + &rp->rx_pending, &rp->rx_max_pending); +} + +static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { + "rx_packets", "rx_bytes", + "rx_duplicates", "rx_fragments", "rx_dropped", + "tx_packets", "tx_bytes", "tx_fragments", + "tx_filtered", "tx_retry_failed", "tx_retries", + "beacon_loss", "sta_state", "txrate", "rxrate", "signal", + "channel", "noise", "ch_time", "ch_time_busy", + "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" +}; +#define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) + +static int ieee80211_get_sset_count(struct net_device *dev, int sset) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int rv = 0; + + if (sset == ETH_SS_STATS) + rv += STA_STATS_LEN; + + rv += drv_get_et_sset_count(sdata, sset); + + if (rv == 0) + return -EOPNOTSUPP; + return rv; +} + +static void ieee80211_get_stats(struct net_device *dev, + struct ethtool_stats *stats, + 
u64 *data) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_channel *channel; + struct sta_info *sta; + struct ieee80211_local *local = sdata->local; + struct station_info sinfo; + struct survey_info survey; + int i, q; +#define STA_STATS_SURVEY_LEN 7 + + memset(data, 0, sizeof(u64) * STA_STATS_LEN); + +#define ADD_STA_STATS(sta) \ + do { \ + data[i++] += sta->rx_packets; \ + data[i++] += sta->rx_bytes; \ + data[i++] += sta->num_duplicates; \ + data[i++] += sta->rx_fragments; \ + data[i++] += sta->rx_dropped; \ + \ + data[i++] += sinfo.tx_packets; \ + data[i++] += sinfo.tx_bytes; \ + data[i++] += sta->tx_fragments; \ + data[i++] += sta->tx_filtered_count; \ + data[i++] += sta->tx_retry_failed; \ + data[i++] += sta->tx_retry_count; \ + data[i++] += sta->beacon_loss_count; \ + } while (0) + + /* For Managed stations, find the single station based on BSSID + * and use that. For interface types, iterate through all available + * stations and add stats for any station that is assigned to this + * network device. 
+ */ + + mutex_lock(&local->sta_mtx); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + sta = sta_info_get_bss(sdata, sdata->u.mgd.bssid); + + if (!(sta && !WARN_ON(sta->sdata->dev != dev))) + goto do_survey; + + sinfo.filled = 0; + sta_set_sinfo(sta, &sinfo); + + i = 0; + ADD_STA_STATS(sta); + + data[i++] = sta->sta_state; + + + if (sinfo.filled & STATION_INFO_TX_BITRATE) + data[i] = 100000 * + cfg80211_calculate_bitrate(&sinfo.txrate); + i++; + if (sinfo.filled & STATION_INFO_RX_BITRATE) + data[i] = 100000 * + cfg80211_calculate_bitrate(&sinfo.rxrate); + i++; + + if (sinfo.filled & STATION_INFO_SIGNAL_AVG) + data[i] = (u8)sinfo.signal_avg; + i++; + } else { + list_for_each_entry(sta, &local->sta_list, list) { + /* Make sure this station belongs to the proper dev */ + if (sta->sdata->dev != dev) + continue; + + sinfo.filled = 0; + sta_set_sinfo(sta, &sinfo); + i = 0; + ADD_STA_STATS(sta); + } + } + +do_survey: + i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; + /* Get survey stats for current channel */ + survey.filled = 0; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (chanctx_conf) + channel = chanctx_conf->def.chan; + else + channel = NULL; + rcu_read_unlock(); + + if (channel) { + q = 0; + do { + survey.filled = 0; + if (drv_get_survey(local, q, &survey) != 0) { + survey.filled = 0; + break; + } + q++; + } while (channel != survey.channel); + } + + if (survey.filled) + data[i++] = survey.channel->center_freq; + else + data[i++] = 0; + if (survey.filled & SURVEY_INFO_NOISE_DBM) + data[i++] = (u8)survey.noise; + else + data[i++] = -1LL; + if (survey.filled & SURVEY_INFO_CHANNEL_TIME) + data[i++] = survey.channel_time; + else + data[i++] = -1LL; + if (survey.filled & SURVEY_INFO_CHANNEL_TIME_BUSY) + data[i++] = survey.channel_time_busy; + else + data[i++] = -1LL; + if (survey.filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY) + data[i++] = survey.channel_time_ext_busy; + else + data[i++] = -1LL; + if (survey.filled & 
SURVEY_INFO_CHANNEL_TIME_RX) + data[i++] = survey.channel_time_rx; + else + data[i++] = -1LL; + if (survey.filled & SURVEY_INFO_CHANNEL_TIME_TX) + data[i++] = survey.channel_time_tx; + else + data[i++] = -1LL; + + mutex_unlock(&local->sta_mtx); + + if (WARN_ON(i != STA_STATS_LEN)) + return; + + drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); +} + +static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int sz_sta_stats = 0; + + if (sset == ETH_SS_STATS) { + sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); + memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); + } + drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); +} + +static int ieee80211_get_regs_len(struct net_device *dev) +{ + return 0; +} + +static void ieee80211_get_regs(struct net_device *dev, + struct ethtool_regs *regs, + void *data) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + + regs->version = wdev->wiphy->hw_version; + regs->len = 0; +} + +const struct ethtool_ops ieee80211_ethtool_ops = { + .get_drvinfo = cfg80211_get_drvinfo, + .get_regs_len = ieee80211_get_regs_len, + .get_regs = ieee80211_get_regs, + .get_link = ethtool_op_get_link, + .get_ringparam = ieee80211_get_ringparam, + .set_ringparam = ieee80211_set_ringparam, + .get_strings = ieee80211_get_strings, + .get_ethtool_stats = ieee80211_get_stats, + .get_sset_count = ieee80211_get_sset_count, +}; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 15702ff64a4c..ff630be2ca75 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -150,13 +150,12 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, /* * If user has specified capability over-rides, take care - * of that if the station we're setting up is the AP that + * of that if the station we're setting up is the AP or TDLS peer that * we advertised a restricted capability set to. 
Override * our own capabilities and then use those below. */ - if ((sdata->vif.type == NL80211_IFTYPE_STATION || - sdata->vif.type == NL80211_IFTYPE_ADHOC) && - !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + if (sdata->vif.type == NL80211_IFTYPE_STATION || + sdata->vif.type == NL80211_IFTYPE_ADHOC) ieee80211_apply_htcap_overrides(sdata, &own_cap); /* @@ -228,6 +227,9 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, if (own_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1) ht_cap.mcs.rx_mask[32/8] |= 1; + /* set Rx highest rate */ + ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest; + apply: changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap)); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 18ee0a256b1e..56b53571c807 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -6,6 +6,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -143,7 +144,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, *pos++ = csa_settings->block_tx ? 
1 : 0; *pos++ = ieee80211_frequency_to_channel( csa_settings->chandef.chan->center_freq); - sdata->csa_counter_offset_beacon[0] = (pos - presp->head); + presp->csa_counter_offsets[0] = (pos - presp->head); *pos++ = csa_settings->count; } @@ -189,17 +190,8 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, chandef, 0); } - if (local->hw.queues >= IEEE80211_NUM_ACS) { - *pos++ = WLAN_EID_VENDOR_SPECIFIC; - *pos++ = 7; /* len */ - *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ - *pos++ = 0x50; - *pos++ = 0xf2; - *pos++ = 2; /* WME */ - *pos++ = 0; /* WME info */ - *pos++ = 1; /* WME ver */ - *pos++ = 0; /* U-APSD no in use */ - } + if (local->hw.queues >= IEEE80211_NUM_ACS) + pos = ieee80211_add_wmm_info_ie(pos, 0); /* U-APSD not in use */ presp->head_len = pos - presp->head; if (WARN_ON(presp->head_len > frame_len)) @@ -1047,7 +1039,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, } if (sta && elems->wmm_info) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = true; if (sta && elems->ht_operation && elems->ht_cap_elem && sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ac9836e0aab3..c2aaec4dfcf0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -3,6 +3,7 @@ * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -229,16 +230,29 @@ struct ieee80211_rx_data { u16 tkip_iv16; }; +struct ieee80211_csa_settings { + const u16 *counter_offsets_beacon; + const u16 *counter_offsets_presp; + + int n_counter_offsets_beacon; + int n_counter_offsets_presp; + + u8 count; +}; + struct beacon_data { u8 *head, *tail; int head_len, tail_len; struct ieee80211_meshconf_ie *meshconf; + u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; + u8 csa_current_counter; struct rcu_head rcu_head; }; struct probe_resp { struct rcu_head rcu_head; int len; + u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; u8 data[0]; }; @@ -332,7 +346,6 @@ enum ieee80211_sta_flags { IEEE80211_STA_CONNECTION_POLL = BIT(1), IEEE80211_STA_CONTROL_PORT = BIT(2), IEEE80211_STA_DISABLE_HT = BIT(4), - IEEE80211_STA_CSA_RECEIVED = BIT(5), IEEE80211_STA_MFP_ENABLED = BIT(6), IEEE80211_STA_UAPSD_ENABLED = BIT(7), IEEE80211_STA_NULLFUNC_ACKED = BIT(8), @@ -342,6 +355,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_80P80MHZ = BIT(12), IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), + IEEE80211_STA_ENABLE_RRM = BIT(15), }; struct ieee80211_mgd_auth_data { @@ -490,6 +504,9 @@ struct ieee80211_if_managed { struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */ struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */ struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */ + + u8 tdls_peer[ETH_ALEN] __aligned(2); + struct delayed_work tdls_peer_del_work; }; struct ieee80211_if_ibss { @@ -688,6 +705,24 @@ enum ieee80211_chanctx_mode { IEEE80211_CHANCTX_EXCLUSIVE }; +/** + * enum ieee80211_chanctx_replace_state - channel context replacement state + * + 
* This is used for channel context in-place reservations that require channel + * context switch/swap. + * + * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place + * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced + * by a (not yet registered) channel context pointed by %replace_ctx. + * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context + * replaces an existing channel context pointed to by %replace_ctx. + */ +enum ieee80211_chanctx_replace_state { + IEEE80211_CHANCTX_REPLACE_NONE, + IEEE80211_CHANCTX_WILL_BE_REPLACED, + IEEE80211_CHANCTX_REPLACES_OTHER, +}; + struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; @@ -695,6 +730,9 @@ struct ieee80211_chanctx { struct list_head assigned_vifs; struct list_head reserved_vifs; + enum ieee80211_chanctx_replace_state replace_state; + struct ieee80211_chanctx *replace_ctx; + enum ieee80211_chanctx_mode mode; bool driver_present; @@ -754,9 +792,6 @@ struct ieee80211_sub_if_data { struct mac80211_qos_map __rcu *qos_map; struct work_struct csa_finalize_work; - u16 csa_counter_offset_beacon[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u16 csa_counter_offset_presp[IEEE80211_MAX_CSA_COUNTERS_NUM]; - bool csa_radar_required; bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; @@ -767,7 +802,7 @@ struct ieee80211_sub_if_data { struct ieee80211_chanctx *reserved_chanctx; struct cfg80211_chan_def reserved_chandef; bool reserved_radar_required; - u8 csa_current_counter; + bool reserved_ready; /* used to reconfigure hardware SM PS */ struct work_struct recalc_smps; @@ -892,10 +927,17 @@ ieee80211_vif_get_shift(struct ieee80211_vif *vif) return shift; } +struct ieee80211_rx_agg { + u8 addr[ETH_ALEN]; + u16 tid; +}; + enum sdata_queue_type { IEEE80211_SDATA_QUEUE_TYPE_FRAME = 0, IEEE80211_SDATA_QUEUE_AGG_START = 1, IEEE80211_SDATA_QUEUE_AGG_STOP = 2, + IEEE80211_SDATA_QUEUE_RX_AGG_START = 3, + 
IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4, }; enum { @@ -912,6 +954,9 @@ enum queue_stop_reason { IEEE80211_QUEUE_STOP_REASON_SKB_ADD, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, IEEE80211_QUEUE_STOP_REASON_FLUSH, + IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN, + + IEEE80211_QUEUE_STOP_REASONS, }; #ifdef CONFIG_MAC80211_LEDS @@ -1008,6 +1053,7 @@ struct ieee80211_local { struct workqueue_struct *workqueue; unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; + int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS]; /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */ spinlock_t queue_stop_reason_lock; @@ -1135,7 +1181,8 @@ struct ieee80211_local { unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; - struct cfg80211_scan_request *scan_req, *hw_scan_req; + struct cfg80211_scan_request *scan_req; + struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; enum ieee80211_band hw_scan_band; int scan_channel_idx; @@ -1322,6 +1369,7 @@ struct ieee802_11_elems { const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const u8 *country_elem; const u8 *pwr_constr_elem; + const u8 *cisco_dtpc_elem; const struct ieee80211_timeout_interval_ie *timeout_int; const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; @@ -1476,7 +1524,6 @@ void ieee80211_sw_roc_work(struct work_struct *work); void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc); /* channel switch handling */ -bool ieee80211_csa_needs_block_tx(struct ieee80211_local *local); void ieee80211_csa_finalize_work(struct work_struct *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); @@ -1540,6 +1587,10 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); +void 
__ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, @@ -1692,6 +1743,21 @@ static inline void ieee802_11_parse_elems(const u8 *start, size_t len, ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0); } +static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames) +{ + struct sk_buff *tail = skb_peek_tail(frames); + struct ieee80211_rx_status *status; + + if (!tail) + return false; + + status = IEEE80211_SKB_RXCB(tail); + if (status->flag & RX_FLAG_AMSDU_MORE) + return false; + + return true; +} + void ieee80211_dynamic_ps_enable_work(struct work_struct *work); void ieee80211_dynamic_ps_disable_work(struct work_struct *work); void ieee80211_dynamic_ps_timer(unsigned long data); @@ -1705,14 +1771,24 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, - enum queue_stop_reason reason); + enum queue_stop_reason reason, + bool refcounted); +void ieee80211_stop_vif_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum queue_stop_reason reason); +void ieee80211_wake_vif_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum queue_stop_reason reason); void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, - enum queue_stop_reason reason); + enum queue_stop_reason reason, + bool refcounted); void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason); + enum queue_stop_reason reason, + bool refcounted); void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason); + enum queue_stop_reason reason, + 
bool refcounted); void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); @@ -1730,8 +1806,10 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, - size_t buffer_len, const u8 *ie, size_t ie_len, - enum ieee80211_band band, u32 rate_mask, + size_t buffer_len, + struct ieee80211_scan_ies *ie_desc, + const u8 *ie, size_t ie_len, + u8 bands_used, u32 *rate_masks, struct cfg80211_chan_def *chandef); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, u32 ratemask, @@ -1774,6 +1852,7 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, enum ieee80211_band band); +u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan, @@ -1791,18 +1870,13 @@ ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata, enum ieee80211_chanctx_mode mode, bool radar_required); int __must_check -ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata, - u32 *changed); +ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata); int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata); int __must_check ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 *changed); -/* NOTE: only use ieee80211_vif_change_channel() for channel switch */ -int __must_check -ieee80211_vif_change_channel(struct ieee80211_sub_if_data *sdata, - u32 *changed); void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata); void ieee80211_vif_vlan_copy_chanctx(struct 
ieee80211_sub_if_data *sdata); void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata, @@ -1842,10 +1916,13 @@ int ieee80211_max_num_channels(struct ieee80211_local *local); int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, - const u8 *extra_ies, size_t extra_ies_len); + bool initiator, const u8 *extra_ies, + size_t extra_ies_len); int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper); +void ieee80211_tdls_peer_del_work(struct work_struct *wk); +extern const struct ethtool_ops ieee80211_ethtool_ops; #ifdef CONFIG_MAC80211_NOINLINE #define debug_noinline noinline diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 388b863e821c..af237223a8cd 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -5,6 +5,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -841,10 +842,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, sdata_lock(sdata); mutex_lock(&local->mtx); sdata->vif.csa_active = false; - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_block_tx = false; + } mutex_unlock(&local->mtx); sdata_unlock(sdata); @@ -1139,6 +1141,7 @@ static void ieee80211_iface_work(struct work_struct *work) struct sk_buff *skb; struct sta_info *sta; struct ieee80211_ra_tid *ra_tid; + struct ieee80211_rx_agg *rx_agg; if 
(!ieee80211_sdata_running(sdata)) return; @@ -1166,6 +1169,26 @@ static void ieee80211_iface_work(struct work_struct *work) ra_tid = (void *)&skb->cb; ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra, ra_tid->tid); + } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) { + rx_agg = (void *)&skb->cb; + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, rx_agg->addr); + if (sta) + __ieee80211_start_rx_ba_session(sta, + 0, 0, 0, 1, rx_agg->tid, + IEEE80211_MAX_AMPDU_BUF, + false, true); + mutex_unlock(&local->sta_mtx); + } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_STOP) { + rx_agg = (void *)&skb->cb; + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, rx_agg->addr); + if (sta) + __ieee80211_stop_rx_ba_session(sta, + rx_agg->tid, + WLAN_BACK_RECIPIENT, 0, + false); + mutex_unlock(&local->sta_mtx); } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { int len = skb->len; @@ -1623,9 +1646,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->hw.queues >= IEEE80211_NUM_ACS) txqs = IEEE80211_NUM_ACS; - ndev = alloc_netdev_mqs(sizeof(*sdata) + - local->hw.vif_data_size, - name, ieee80211_if_setup, txqs, 1); + ndev = alloc_netdev_mqs(sizeof(*sdata) + local->hw.vif_data_size, + name, NET_NAME_UNKNOWN, + ieee80211_if_setup, txqs, 1); if (!ndev) return -ENOMEM; dev_net_set(ndev, wiphy_net(local->hw.wiphy)); @@ -1705,6 +1728,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ndev->features |= local->hw.netdev_features; + netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); + ret = register_netdevice(ndev); if (ret) { free_netdev(ndev); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 16d97f044a20..4712150dc210 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -130,9 +131,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) sdata->crypto_tx_tailroom_needed_cnt--; WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && @@ -180,9 +179,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta = key->sta; sdata = key->sdata; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(sdata); ret = drv_set_key(key->local, DISABLE_KEY, sdata, @@ -425,7 +422,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key) ieee80211_aes_key_free(key->u.ccmp.tfm); if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); - kfree(key); + kzfree(key); } static void __ieee80211_key_destroy(struct ieee80211_key *key, @@ -482,9 +479,6 @@ int ieee80211_key_link(struct ieee80211_key *key, int idx, ret; bool pairwise; - if (WARN_ON(!sdata || !key)) - return -EINVAL; - pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; idx = key->conf.keyidx; key->local = sdata->local; @@ -881,9 +875,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= 
~KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) || - (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV) || - (key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) increment_tailroom_need_count(key->sdata); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d17c26d6e369..0de7c93bf62b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -272,7 +273,8 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) /* use this reason, ieee80211_reconfig will unblock it */ ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + false); /* * Stop all Rx during the reconfig. 
We don't want state changes @@ -1187,18 +1189,12 @@ static int __init ieee80211_init(void) if (ret) goto err_minstrel; - ret = rc80211_pid_init(); - if (ret) - goto err_pid; - ret = ieee80211_iface_init(); if (ret) goto err_netdev; return 0; err_netdev: - rc80211_pid_exit(); - err_pid: rc80211_minstrel_ht_exit(); err_minstrel: rc80211_minstrel_exit(); @@ -1208,7 +1204,6 @@ static int __init ieee80211_init(void) static void __exit ieee80211_exit(void) { - rc80211_pid_exit(); rc80211_minstrel_ht_exit(); rc80211_minstrel_exit(); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6495a3f0428d..e9f99c1e3fad 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -679,7 +679,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) *pos++ = 0x0; *pos++ = ieee80211_frequency_to_channel( csa->settings.chandef.chan->center_freq); - sdata->csa_counter_offset_beacon[0] = hdr_len + 6; + bcn->csa_counter_offsets[0] = hdr_len + 6; *pos++ = csa->settings.count; *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; *pos++ = 6; @@ -1122,7 +1122,7 @@ static int mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata, mgmt_fwd = (struct ieee80211_mgmt *) skb_put(skb, len); /* offset_ttl is based on whether the secondary channel - * offset is available or not. Substract 1 from the mesh TTL + * offset is available or not. Subtract 1 from the mesh TTL * and disable the initiator flag before forwarding. */ offset_ttl = (len < 42) ? 
7 : 10; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 94758b9c9ed4..214e63b84e5c 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -157,7 +157,6 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, default: kfree_skb(skb); return -ENOTSUPP; - break; } *pos++ = ie_len; *pos++ = flags; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index cf032a8db9d7..a6699dceae7c 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -729,7 +729,7 @@ void mesh_plink_broken(struct sta_info *sta) tbl = rcu_dereference(mesh_paths); for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; - if (rcu_dereference(mpath->next_hop) == sta && + if (rcu_access_pointer(mpath->next_hop) == sta && mpath->flags & MESH_PATH_ACTIVE && !(mpath->flags & MESH_PATH_FIXED)) { spin_lock_bh(&mpath->state_lock); @@ -794,7 +794,7 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) tbl = resize_dereference_mesh_paths(); for_each_mesh_entry(tbl, node, i) { mpath = node->mpath; - if (rcu_dereference(mpath->next_hop) == sta) { + if (rcu_access_pointer(mpath->next_hop) == sta) { spin_lock(&tbl->hashwlock[i]); __mesh_path_del(tbl, node); spin_unlock(&tbl->hashwlock[i]); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index e8f60aa2e848..b488e1859b18 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -431,14 +431,12 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) return NULL; sta->plink_state = NL80211_PLINK_LISTEN; + sta->sta.wme = true; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - set_sta_flag(sta, WLAN_STA_WME); - sta->sta.wme = true; - return sta; } @@ -551,11 +549,30 @@ static void mesh_plink_timer(unsigned long data) return; spin_lock_bh(&sta->lock); - if (sta->ignore_plink_timer) { - sta->ignore_plink_timer 
= false; + + /* If a timer fires just before a state transition on another CPU, + * we may have already extended the timeout and changed state by the + * time we've acquired the lock and arrived here. In that case, + * skip this timer and wait for the new one. + */ + if (time_before(jiffies, sta->plink_timer.expires)) { + mpl_dbg(sta->sdata, + "Ignoring timer for %pM in state %s (timer adjusted)", + sta->sta.addr, mplstates[sta->plink_state]); + spin_unlock_bh(&sta->lock); + return; + } + + /* del_timer() and handler may race when entering these states */ + if (sta->plink_state == NL80211_PLINK_LISTEN || + sta->plink_state == NL80211_PLINK_ESTAB) { + mpl_dbg(sta->sdata, + "Ignoring timer for %pM in state %s (timer deleted)", + sta->sta.addr, mplstates[sta->plink_state]); spin_unlock_bh(&sta->lock); return; } + mpl_dbg(sta->sdata, "Mesh plink timer for %pM fired on state %s\n", sta->sta.addr, mplstates[sta->plink_state]); @@ -773,9 +790,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, break; case CNF_ACPT: sta->plink_state = NL80211_PLINK_CNF_RCVD; - if (!mod_plink_timer(sta, - mshcfg->dot11MeshConfirmTimeout)) - sta->ignore_plink_timer = true; + mod_plink_timer(sta, mshcfg->dot11MeshConfirmTimeout); break; default: break; @@ -834,8 +849,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, case NL80211_PLINK_HOLDING: switch (event) { case CLS_ACPT: - if (del_timer(&sta->plink_timer)) - sta->ignore_plink_timer = 1; + del_timer(&sta->plink_timer); mesh_plink_fsm_restart(sta); break; case OPN_ACPT: @@ -943,7 +957,8 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = CNF_RJCT; if (!mesh_plink_free_count(sdata) || - (sta->llid != llid || sta->plid != plid)) + sta->llid != llid || + (sta->plid && sta->plid != plid)) event = CNF_IGNR; else event = CNF_ACPT; @@ -987,7 +1002,6 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, enum ieee80211_self_protected_actioncode ftype; u32 changed = 0; 
u8 ie_len = elems->peering_len; - __le16 _plid, _llid; u16 plid, llid = 0; if (!elems->peering) { @@ -1022,13 +1036,10 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, /* Note the lines below are correct, the llid in the frame is the plid * from the point of view of this host. */ - memcpy(&_plid, PLINK_GET_LLID(elems->peering), sizeof(__le16)); - plid = le16_to_cpu(_plid); + plid = get_unaligned_le16(PLINK_GET_LLID(elems->peering)); if (ftype == WLAN_SP_MESH_PEERING_CONFIRM || - (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8)) { - memcpy(&_llid, PLINK_GET_PLID(elems->peering), sizeof(__le16)); - llid = le16_to_cpu(_llid); - } + (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8)) + llid = get_unaligned_le16(PLINK_GET_PLID(elems->peering)); /* WARNING: Only for sta pointer, is dropped & re-acquired */ rcu_read_lock(); @@ -1064,6 +1075,10 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } + /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ + if (!sta->plid && event == CNF_ACPT) + sta->plid = plid; + changed |= mesh_plink_fsm(sdata, sta, event); unlock_rcu: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3345401be1b3..2de88704278b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5,6 +5,7 @@ * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -149,6 +150,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, struct cfg80211_chan_def *chandef, bool tracking) @@ -162,13 +164,19 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->center_freq1 = channel->center_freq; chandef->center_freq2 = 0; - if (!ht_oper || !sband->ht_cap.ht_supported) { + if (!ht_cap || !ht_oper || !sband->ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; goto out; } chandef->width = NL80211_CHAN_WIDTH_20; + if (!(ht_cap->cap_info & + cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) { + ret = IEEE80211_STA_DISABLE_40MHZ; + goto out; + } + ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ @@ -328,6 +336,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, + const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const u8 *bssid, u32 *changed) @@ -367,8 +376,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[chan->band]; /* calculate new channel (type) based on HT/VHT operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, ht_oper, - vht_oper, &chandef, true); + flags = ieee80211_determine_chantype(sdata, sband, chan, + ht_cap, ht_oper, vht_oper, + &chandef, true); 
/* * Downgrade the new channel if we associated with restricted @@ -663,6 +673,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)) capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; + if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) + capab |= WLAN_CAPABILITY_RADIO_MEASURE; + mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); memset(mgmt, 0, 24); memcpy(mgmt->da, assoc_data->bss->bssid, ETH_ALEN); @@ -728,16 +741,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) } } - if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) { - /* 1. power capabilities */ + if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT || + capab & WLAN_CAPABILITY_RADIO_MEASURE) { pos = skb_put(skb, 4); *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ /* max tx power */ *pos++ = ieee80211_chandef_max_power(&chanctx_conf->def); + } - /* 2. supported channels */ + if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) { /* TODO: get this in reg domain format */ pos = skb_put(skb, 2 * sband->n_channels + 2); *pos++ = WLAN_EID_SUPPORTED_CHANNELS; @@ -830,16 +844,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) qos_info = 0; } - pos = skb_put(skb, 9); - *pos++ = WLAN_EID_VENDOR_SPECIFIC; - *pos++ = 7; /* len */ - *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ - *pos++ = 0x50; - *pos++ = 0xf2; - *pos++ = 2; /* WME */ - *pos++ = 0; /* WME info */ - *pos++ = 1; /* WME ver */ - *pos++ = qos_info; + pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); } /* add any remaining custom (i.e. 
vendor specific here) IEs */ @@ -940,58 +945,77 @@ static void ieee80211_chswitch_work(struct work_struct *work) container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work); struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - u32 changed = 0; int ret; if (!ieee80211_sdata_running(sdata)) return; sdata_lock(sdata); + mutex_lock(&local->mtx); + mutex_lock(&local->chanctx_mtx); + if (!ifmgd->associated) goto out; - mutex_lock(&local->mtx); - ret = ieee80211_vif_change_channel(sdata, &changed); - mutex_unlock(&local->mtx); - if (ret) { + if (!sdata->vif.csa_active) + goto out; + + /* + * using reservation isn't immediate as it may be deferred until later + * with multi-vif. once reservation is complete it will re-schedule the + * work with no reserved_chanctx so verify chandef to check if it + * completed successfully + */ + + if (sdata->reserved_chanctx) { + /* + * with multi-vif csa driver may call ieee80211_csa_finish() + * many times while waiting for other interfaces to use their + * reservations + */ + if (sdata->reserved_ready) + goto out; + + ret = ieee80211_vif_use_reserved_context(sdata); + if (ret) { + sdata_info(sdata, + "failed to use reserved channel context, disconnecting (err=%d)\n", + ret); + ieee80211_queue_work(&sdata->local->hw, + &ifmgd->csa_connection_drop_work); + goto out; + } + + goto out; + } + + if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef, + &sdata->csa_chandef)) { sdata_info(sdata, - "vif channel switch failed, disconnecting\n"); + "failed to finalize channel switch, disconnecting\n"); ieee80211_queue_work(&sdata->local->hw, &ifmgd->csa_connection_drop_work); goto out; } - if (!local->use_chanctx) { - local->_oper_chandef = sdata->csa_chandef; - /* Call "hw_config" only if doing sw channel switch. 
- * Otherwise update the channel directly - */ - if (!local->ops->channel_switch) - ieee80211_hw_config(local, 0); - else - local->hw.conf.chandef = local->_oper_chandef; - } - /* XXX: shouldn't really modify cfg80211-owned data! */ ifmgd->associated->channel = sdata->csa_chandef.chan; - ieee80211_bss_info_change_notify(sdata, changed); - - mutex_lock(&local->mtx); sdata->vif.csa_active = false; - /* XXX: wait for a beacon first? */ - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); - mutex_unlock(&local->mtx); - ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; + /* XXX: wait for a beacon first? */ + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_block_tx = false; + } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); out: + mutex_unlock(&local->chanctx_mtx); + mutex_unlock(&local->mtx); sdata_unlock(sdata); } @@ -1028,6 +1052,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_bss *cbss = ifmgd->associated; + struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; enum ieee80211_band current_band; struct ieee80211_csa_ie csa_ie; @@ -1042,7 +1067,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; /* disregard subsequent announcements if we are already processing */ - if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED) + if (sdata->vif.csa_active) return; current_band = cbss->channel->band; @@ -1069,9 +1094,22 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } - ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED; - + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + 
lockdep_is_held(&local->chanctx_mtx)); + if (!conf) { + sdata_info(sdata, + "no channel context assigned to vif?, disconnecting\n"); + ieee80211_queue_work(&local->hw, + &ifmgd->csa_connection_drop_work); + mutex_unlock(&local->chanctx_mtx); + mutex_unlock(&local->mtx); + return; + } + + chanctx = container_of(conf, struct ieee80211_chanctx, conf); + if (local->use_chanctx) { u32 num_chanctx = 0; list_for_each_entry(chanctx, &local->chanctx_list, list) @@ -1084,38 +1122,32 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); + mutex_unlock(&local->mtx); return; } } - if (WARN_ON(!rcu_access_pointer(sdata->vif.chanctx_conf))) { - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - mutex_unlock(&local->chanctx_mtx); - return; - } - chanctx = container_of(rcu_access_pointer(sdata->vif.chanctx_conf), - struct ieee80211_chanctx, conf); - if (ieee80211_chanctx_refcount(local, chanctx) > 1) { + res = ieee80211_vif_reserve_chanctx(sdata, &csa_ie.chandef, + chanctx->mode, false); + if (res) { sdata_info(sdata, - "channel switch with multiple interfaces on the same channel, disconnecting\n"); + "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", + res); ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); + mutex_unlock(&local->mtx); return; } mutex_unlock(&local->chanctx_mtx); - sdata->csa_chandef = csa_ie.chandef; - - mutex_lock(&local->mtx); sdata->vif.csa_active = true; + sdata->csa_chandef = csa_ie.chandef; sdata->csa_block_tx = csa_ie.mode; if (sdata->csa_block_tx) - ieee80211_stop_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + ieee80211_stop_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); mutex_unlock(&local->mtx); if (local->ops->channel_switch) { @@ -1139,19 +1171,21 @@ 
ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, TU_TO_EXP_TIME(csa_ie.count * cbss->beacon_interval)); } -static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel *channel, - const u8 *country_ie, u8 country_ie_len, - const u8 *pwr_constr_elem) +static bool +ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *country_ie, u8 country_ie_len, + const u8 *pwr_constr_elem, + int *chan_pwr, int *pwr_reduction) { struct ieee80211_country_ie_triplet *triplet; int chan = ieee80211_frequency_to_channel(channel->center_freq); - int i, chan_pwr, chan_increment, new_ap_level; + int i, chan_increment; bool have_chan_pwr = false; /* Invalid IE */ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) - return 0; + return false; triplet = (void *)(country_ie + 3); country_ie_len -= 3; @@ -1179,7 +1213,7 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, for (i = 0; i < triplet->chans.num_channels; i++) { if (first_channel + i * chan_increment == chan) { have_chan_pwr = true; - chan_pwr = triplet->chans.max_power; + *chan_pwr = triplet->chans.max_power; break; } } @@ -1191,18 +1225,76 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, country_ie_len -= 3; } - if (!have_chan_pwr) + if (have_chan_pwr) + *pwr_reduction = *pwr_constr_elem; + return have_chan_pwr; +} + +static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + const u8 *cisco_dtpc_ie, + int *pwr_level) +{ + /* From practical testing, the first data byte of the DTPC element + * seems to contain the requested dBm level, and the CLI on Cisco + * APs clearly state the range is -127 to 127 dBm, which indicates + * a signed byte, although it seemingly never actually goes negative. + * The other byte seems to always be zero. 
+ */ + *pwr_level = (__s8)cisco_dtpc_ie[4]; +} + +static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + struct ieee80211_mgmt *mgmt, + const u8 *country_ie, u8 country_ie_len, + const u8 *pwr_constr_ie, + const u8 *cisco_dtpc_ie) +{ + bool has_80211h_pwr = false, has_cisco_pwr = false; + int chan_pwr = 0, pwr_reduction_80211h = 0; + int pwr_level_cisco, pwr_level_80211h; + int new_ap_level; + + if (country_ie && pwr_constr_ie && + mgmt->u.probe_resp.capab_info & + cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT)) { + has_80211h_pwr = ieee80211_find_80211h_pwr_constr( + sdata, channel, country_ie, country_ie_len, + pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); + pwr_level_80211h = + max_t(int, 0, chan_pwr - pwr_reduction_80211h); + } + + if (cisco_dtpc_ie) { + ieee80211_find_cisco_dtpc( + sdata, channel, cisco_dtpc_ie, &pwr_level_cisco); + has_cisco_pwr = true; + } + + if (!has_80211h_pwr && !has_cisco_pwr) return 0; - new_ap_level = max_t(int, 0, chan_pwr - *pwr_constr_elem); + /* If we have both 802.11h and Cisco DTPC, apply both limits + * by picking the smallest of the two power levels advertised. + */ + if (has_80211h_pwr && + (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + sdata_info(sdata, + "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", + pwr_level_80211h, chan_pwr, pwr_reduction_80211h, + sdata->u.mgd.bssid); + new_ap_level = pwr_level_80211h; + } else { /* has_cisco_pwr is always true here. 
*/ + sdata_info(sdata, + "Limiting TX power to %d dBm as advertised by %pM\n", + pwr_level_cisco, sdata->u.mgd.bssid); + new_ap_level = pwr_level_cisco; + } if (sdata->ap_power_level == new_ap_level) return 0; - sdata_info(sdata, - "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", - new_ap_level, chan_pwr, *pwr_constr_elem, - sdata->u.mgd.bssid); sdata->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(sdata)) return BSS_CHANGED_TXPOWER; @@ -1385,7 +1477,8 @@ void ieee80211_dynamic_ps_disable_work(struct work_struct *work) ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_PS); + IEEE80211_QUEUE_STOP_REASON_PS, + false); } void ieee80211_dynamic_ps_enable_work(struct work_struct *work) @@ -1830,10 +1923,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_vif_release_channel(sdata); sdata->vif.csa_active = false; - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_block_tx = false; + } mutex_unlock(&local->mtx); sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; @@ -2075,14 +2169,13 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); - ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED; - mutex_lock(&local->mtx); sdata->vif.csa_active = false; - if (!ieee80211_csa_needs_block_tx(local)) - ieee80211_wake_queues_by_reason(&local->hw, - IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_CSA); + if (sdata->csa_block_tx) { + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_block_tx = false; + } mutex_unlock(&local->mtx); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, @@ 
-2658,8 +2751,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) set_sta_flag(sta, WLAN_STA_MFP); - if (elems.wmm_param) - set_sta_flag(sta, WLAN_STA_WME); + sta->sta.wme = elems.wmm_param; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) @@ -2725,6 +2817,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee802_11_elems elems; + int ac, uapsd_queues = -1; u8 *pos; bool reassoc; struct cfg80211_bss *bss; @@ -2794,9 +2887,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, * is set can cause the interface to go idle */ ieee80211_destroy_assoc_data(sdata, true); + + /* get uapsd queues configuration */ + uapsd_queues = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + if (sdata->tx_conf[ac].uapsd) + uapsd_queues |= BIT(ac); } - cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len); + cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, @@ -2866,7 +2965,9 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI - * (00:50:F2) vendor IE which is used for WMM which we need to track. + * (00:50:F2) vendor IE which is used for WMM which we need to track, + * as well as the DTPC IE (part of the Cisco OUI) used for signaling + * changes to requested client power. 
* * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace @@ -3155,7 +3256,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); - if (ieee80211_config_bw(sdata, sta, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, + elems.ht_cap_elem, elems.ht_operation, elems.vht_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, @@ -3171,13 +3273,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, rx_status->band, true); mutex_unlock(&local->sta_mtx); - if (elems.country_elem && elems.pwr_constr_elem && - mgmt->u.probe_resp.capab_info & - cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT)) - changed |= ieee80211_handle_pwr_constr(sdata, chan, - elems.country_elem, - elems.country_elem_len, - elems.pwr_constr_elem); + changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, + elems.country_elem, + elems.country_elem_len, + elems.pwr_constr_elem, + elems.cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); } @@ -3688,6 +3788,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) INIT_WORK(&ifmgd->csa_connection_drop_work, ieee80211_csa_connection_drop_work); INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_mgd_work); + INIT_DELAYED_WORK(&ifmgd->tdls_peer_del_work, + ieee80211_tdls_peer_del_work); setup_timer(&ifmgd->timer, ieee80211_sta_timer, (unsigned long) sdata); setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, @@ -3703,7 +3805,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; ifmgd->p2p_noa_index = -1; - if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) + if (sdata->local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) ifmgd->req_smps = 
IEEE80211_SMPS_AUTOMATIC; else ifmgd->req_smps = IEEE80211_SMPS_OFF; @@ -3787,6 +3889,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; struct ieee80211_supported_band *sband; @@ -3803,14 +3906,17 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && sband->ht_cap.ht_supported) { - const u8 *ht_oper_ie, *ht_cap; + const u8 *ht_oper_ie, *ht_cap_ie; ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); if (ht_oper_ie && ht_oper_ie[1] >= sizeof(*ht_oper)) ht_oper = (void *)(ht_oper_ie + 2); - ht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); - if (!ht_cap || ht_cap[1] < sizeof(struct ieee80211_ht_cap)) { + ht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); + if (ht_cap_ie && ht_cap_ie[1] >= sizeof(*ht_cap)) + ht_cap = (void *)(ht_cap_ie + 2); + + if (!ht_cap) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ht_oper = NULL; } @@ -3841,7 +3947,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_cap, ht_oper, vht_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), @@ -4355,8 +4461,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); if (bss->wmm_used && bss->uapsd_supported && - (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) && - sdata->wmm_acm != 0xff) { + (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; } else { @@ -4375,6 +4480,11 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->flags &= 
~IEEE80211_STA_MFP_ENABLED; } + if (req->flags & ASSOC_REQ_USE_RRM) + ifmgd->flags |= IEEE80211_STA_ENABLE_RRM; + else + ifmgd->flags &= ~IEEE80211_STA_ENABLE_RRM; + if (req->crypto.control_port) ifmgd->flags |= IEEE80211_STA_CONTROL_PORT; else @@ -4551,6 +4661,7 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) cancel_work_sync(&ifmgd->request_smps_work); cancel_work_sync(&ifmgd->csa_connection_drop_work); cancel_work_sync(&ifmgd->chswitch_work); + cancel_delayed_work_sync(&ifmgd->tdls_peer_del_work); sdata_lock(sdata); if (ifmgd->assoc_data) { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 7a17decd27f9..ff20b2ebdb30 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -119,7 +119,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) * before sending nullfunc to enable powersave at the AP. */ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL); + IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, + false); ieee80211_flush_queues(local, NULL); mutex_lock(&local->iflist_mtx); @@ -182,7 +183,8 @@ void ieee80211_offchannel_return(struct ieee80211_local *local) mutex_unlock(&local->iflist_mtx); ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL); + IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, + false); } void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc) diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index d478b880a0af..4c5192e0d66c 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -35,7 +35,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + false); /* flush out all packets */ synchronize_net(); @@ -74,7 +75,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan 
*wowlan) } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + false); return err; } else if (err > 0) { WARN_ON(err != 1); diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 9aa2a1190a86..18babe302832 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -143,19 +143,6 @@ void rate_control_deinitialize(struct ieee80211_local *local); /* Rate control algorithms */ -#ifdef CONFIG_MAC80211_RC_PID -int rc80211_pid_init(void); -void rc80211_pid_exit(void); -#else -static inline int rc80211_pid_init(void) -{ - return 0; -} -static inline void rc80211_pid_exit(void) -{ -} -#endif - #ifdef CONFIG_MAC80211_RC_MINSTREL int rc80211_minstrel_init(void); void rc80211_minstrel_exit(void); diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 1c1469c36dca..2baa7ed8789d 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -75,7 +75,7 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list) { int j = MAX_THR_RATES; - while (j > 0 && mi->r[i].cur_tp > mi->r[tp_list[j - 1]].cur_tp) + while (j > 0 && mi->r[i].stats.cur_tp > mi->r[tp_list[j - 1]].stats.cur_tp) j--; if (j < MAX_THR_RATES - 1) memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1)); @@ -92,7 +92,7 @@ minstrel_set_rate(struct minstrel_sta_info *mi, struct ieee80211_sta_rates *rate ratetbl->rate[offset].idx = r->rix; ratetbl->rate[offset].count = r->adjusted_retry_count; ratetbl->rate[offset].count_cts = r->retry_count_cts; - ratetbl->rate[offset].count_rts = r->retry_count_rtscts; + ratetbl->rate[offset].count_rts = r->stats.retry_count_rtscts; } static void @@ -140,44 +140,46 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; + struct minstrel_rate_stats *mrs = &mi->r[i].stats; usecs = mr->perfect_tx_time; if (!usecs) 
usecs = 1000000; - if (unlikely(mr->attempts > 0)) { - mr->sample_skipped = 0; - mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts); - mr->succ_hist += mr->success; - mr->att_hist += mr->attempts; - mr->probability = minstrel_ewma(mr->probability, - mr->cur_prob, - EWMA_LEVEL); + if (unlikely(mrs->attempts > 0)) { + mrs->sample_skipped = 0; + mrs->cur_prob = MINSTREL_FRAC(mrs->success, + mrs->attempts); + mrs->succ_hist += mrs->success; + mrs->att_hist += mrs->attempts; + mrs->probability = minstrel_ewma(mrs->probability, + mrs->cur_prob, + EWMA_LEVEL); } else - mr->sample_skipped++; + mrs->sample_skipped++; - mr->last_success = mr->success; - mr->last_attempts = mr->attempts; - mr->success = 0; - mr->attempts = 0; + mrs->last_success = mrs->success; + mrs->last_attempts = mrs->attempts; + mrs->success = 0; + mrs->attempts = 0; /* Update throughput per rate, reset thr. below 10% success */ - if (mr->probability < MINSTREL_FRAC(10, 100)) - mr->cur_tp = 0; + if (mrs->probability < MINSTREL_FRAC(10, 100)) + mrs->cur_tp = 0; else - mr->cur_tp = mr->probability * (1000000 / usecs); + mrs->cur_tp = mrs->probability * (1000000 / usecs); /* Sample less often below the 10% chance of success. * Sample less often above the 95% chance of success. 
*/ - if (mr->probability > MINSTREL_FRAC(95, 100) || - mr->probability < MINSTREL_FRAC(10, 100)) { - mr->adjusted_retry_count = mr->retry_count >> 1; + if (mrs->probability > MINSTREL_FRAC(95, 100) || + mrs->probability < MINSTREL_FRAC(10, 100)) { + mr->adjusted_retry_count = mrs->retry_count >> 1; if (mr->adjusted_retry_count > 2) mr->adjusted_retry_count = 2; mr->sample_limit = 4; } else { mr->sample_limit = -1; - mr->adjusted_retry_count = mr->retry_count; + mr->adjusted_retry_count = mrs->retry_count; } if (!mr->adjusted_retry_count) mr->adjusted_retry_count = 2; @@ -190,11 +192,11 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) * choose the maximum throughput rate as max_prob_rate * (2) if all success probabilities < 95%, the rate with * highest success probability is choosen as max_prob_rate */ - if (mr->probability >= MINSTREL_FRAC(95, 100)) { - if (mr->cur_tp >= mi->r[tmp_prob_rate].cur_tp) + if (mrs->probability >= MINSTREL_FRAC(95, 100)) { + if (mrs->cur_tp >= mi->r[tmp_prob_rate].stats.cur_tp) tmp_prob_rate = i; } else { - if (mr->probability >= mi->r[tmp_prob_rate].probability) + if (mrs->probability >= mi->r[tmp_prob_rate].stats.probability) tmp_prob_rate = i; } } @@ -240,14 +242,14 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, if (ndx < 0) continue; - mi->r[ndx].attempts += ar[i].count; + mi->r[ndx].stats.attempts += ar[i].count; if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0)) - mi->r[ndx].success += success; + mi->r[ndx].stats.success += success; } if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0)) - mi->sample_count++; + mi->sample_packets++; if (mi->sample_deferred > 0) mi->sample_deferred--; @@ -265,7 +267,7 @@ minstrel_get_retry_count(struct minstrel_rate *mr, unsigned int retry = mr->adjusted_retry_count; if (info->control.use_rts) - retry = max(2U, min(mr->retry_count_rtscts, retry)); + retry = max(2U, min(mr->stats.retry_count_rtscts, retry)); else if 
(info->control.use_cts_prot) retry = max(2U, min(mr->retry_count_cts, retry)); return retry; @@ -317,15 +319,15 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, sampling_ratio = mp->lookaround_rate; /* increase sum packet counter */ - mi->packet_count++; + mi->total_packets++; #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) return; #endif - delta = (mi->packet_count * sampling_ratio / 100) - - (mi->sample_count + mi->sample_deferred / 2); + delta = (mi->total_packets * sampling_ratio / 100) - + (mi->sample_packets + mi->sample_deferred / 2); /* delta < 0: no sampling required */ prev_sample = mi->prev_sample; @@ -333,10 +335,10 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, if (delta < 0 || (!mrr_capable && prev_sample)) return; - if (mi->packet_count >= 10000) { + if (mi->total_packets >= 10000) { mi->sample_deferred = 0; - mi->sample_count = 0; - mi->packet_count = 0; + mi->sample_packets = 0; + mi->total_packets = 0; } else if (delta > mi->n_rates * 2) { /* With multi-rate retry, not every planned sample * attempt actually gets used, due to the way the retry @@ -347,7 +349,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * starts getting worse, minstrel would start bursting * out lots of sampling frames, which would result * in a large throughput loss. */ - mi->sample_count += (delta - mi->n_rates * 2); + mi->sample_packets += (delta - mi->n_rates * 2); } /* get next random rate sample */ @@ -361,7 +363,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, */ if (mrr_capable && msr->perfect_tx_time > mr->perfect_tx_time && - msr->sample_skipped < 20) { + msr->stats.sample_skipped < 20) { /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark * packets that have the sampling rate deferred to the * second MRR stage. 
Increase the sample counter only @@ -375,7 +377,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, if (!msr->sample_limit != 0) return; - mi->sample_count++; + mi->sample_packets++; if (msr->sample_limit > 0) msr->sample_limit--; } @@ -384,7 +386,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, * has a probability of >95%, we shouldn't be attempting * to use it, as this only wastes precious airtime */ if (!mrr_capable && - (mi->r[ndx].probability > MINSTREL_FRAC(95, 100))) + (mi->r[ndx].stats.probability > MINSTREL_FRAC(95, 100))) return; mi->prev_sample = true; @@ -459,6 +461,7 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, for (i = 0; i < sband->n_bitrates; i++) { struct minstrel_rate *mr = &mi->r[n]; + struct minstrel_rate_stats *mrs = &mi->r[n].stats; unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0; unsigned int tx_time_single; unsigned int cw = mp->cw_min; @@ -471,6 +474,7 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, n++; memset(mr, 0, sizeof(*mr)); + memset(mrs, 0, sizeof(*mrs)); mr->rix = i; shift = ieee80211_chandef_get_shift(chandef); @@ -482,9 +486,9 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, /* calculate maximum number of retransmissions before * fallback (based on maximum segment size) */ mr->sample_limit = -1; - mr->retry_count = 1; + mrs->retry_count = 1; mr->retry_count_cts = 1; - mr->retry_count_rtscts = 1; + mrs->retry_count_rtscts = 1; tx_time = mr->perfect_tx_time + mi->sp_ack_dur; do { /* add one retransmission */ @@ -501,13 +505,13 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband, (mr->retry_count_cts < mp->max_retry)) mr->retry_count_cts++; if ((tx_time_rtscts < mp->segment_size) && - (mr->retry_count_rtscts < mp->max_retry)) - mr->retry_count_rtscts++; + (mrs->retry_count_rtscts < mp->max_retry)) + mrs->retry_count_rtscts++; } while ((tx_time < mp->segment_size) && - (++mr->retry_count < 
mp->max_retry)); - mr->adjusted_retry_count = mr->retry_count; + (++mr->stats.retry_count < mp->max_retry)); + mr->adjusted_retry_count = mrs->retry_count; if (!(sband->bitrates[i].flags & IEEE80211_RATE_ERP_G)) - mr->retry_count_cts = mr->retry_count; + mr->retry_count_cts = mrs->retry_count; } for (i = n; i < sband->n_bitrates; i++) { @@ -665,7 +669,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) /* convert pkt per sec in kbps (1200 is the average pkt size used for * computing cur_tp */ - return MINSTREL_TRUNC(mi->r[idx].cur_tp) * 1200 * 8 / 1024; + return MINSTREL_TRUNC(mi->r[idx].stats.cur_tp) * 1200 * 8 / 1024; } const struct rate_control_ops mac80211_minstrel = { diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index 046d1bd598a8..97eca86a4af0 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -31,6 +31,27 @@ minstrel_ewma(int old, int new, int weight) return (new * (EWMA_DIV - weight) + old * weight) / EWMA_DIV; } +struct minstrel_rate_stats { + /* current / last sampling period attempts/success counters */ + unsigned int attempts, last_attempts; + unsigned int success, last_success; + + /* total attempts/success counters */ + u64 att_hist, succ_hist; + + /* current throughput */ + unsigned int cur_tp; + + /* packet delivery probabilities */ + unsigned int cur_prob, probability; + + /* maximum retry counts */ + unsigned int retry_count; + unsigned int retry_count_rtscts; + + u8 sample_skipped; + bool retry_updated; +}; struct minstrel_rate { int bitrate; @@ -40,26 +61,10 @@ struct minstrel_rate { unsigned int ack_time; int sample_limit; - unsigned int retry_count; unsigned int retry_count_cts; - unsigned int retry_count_rtscts; unsigned int adjusted_retry_count; - u32 success; - u32 attempts; - u32 last_attempts; - u32 last_success; - u8 sample_skipped; - - /* parts per thousand */ - u32 cur_prob; - u32 probability; - - /* per-rate throughput */ - u32 cur_tp; - - u64 succ_hist; 
- u64 att_hist; + struct minstrel_rate_stats stats; }; struct minstrel_sta_info { @@ -73,8 +78,8 @@ struct minstrel_sta_info { u8 max_tp_rate[MAX_THR_RATES]; u8 max_prob_rate; - unsigned int packet_count; - unsigned int sample_count; + unsigned int total_packets; + unsigned int sample_packets; int sample_deferred; unsigned int sample_row; diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index fd0b9ca1570e..edde723f9f00 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -72,6 +72,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) "this succ/attempt success attempts\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; + struct minstrel_rate_stats *mrs = &mi->r[i].stats; *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' '; @@ -81,24 +82,24 @@ minstrel_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "%3u%s", mr->bitrate / 2, (mr->bitrate & 1 ? 
".5" : " ")); - tp = MINSTREL_TRUNC(mr->cur_tp / 10); - prob = MINSTREL_TRUNC(mr->cur_prob * 1000); - eprob = MINSTREL_TRUNC(mr->probability * 1000); + tp = MINSTREL_TRUNC(mrs->cur_tp / 10); + prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); + eprob = MINSTREL_TRUNC(mrs->probability * 1000); p += sprintf(p, " %6u.%1u %6u.%1u %6u.%1u " " %3u(%3u) %8llu %8llu\n", tp / 10, tp % 10, eprob / 10, eprob % 10, prob / 10, prob % 10, - mr->last_success, - mr->last_attempts, - (unsigned long long)mr->succ_hist, - (unsigned long long)mr->att_hist); + mrs->last_success, + mrs->last_attempts, + (unsigned long long)mrs->succ_hist, + (unsigned long long)mrs->att_hist); } p += sprintf(p, "\nTotal packet count:: ideal %d " "lookaround %d\n\n", - mi->packet_count - mi->sample_count, - mi->sample_count); + mi->total_packets - mi->sample_packets, + mi->sample_packets); ms->len = p - ms->buf; return 0; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 85c1e74b7714..df90ce2db00c 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -135,7 +135,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); static int minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) { - return GROUP_IDX((rate->idx / 8) + 1, + return GROUP_IDX((rate->idx / MCS_GROUP_RATES) + 1, !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } @@ -233,12 +233,151 @@ minstrel_ht_calc_tp(struct minstrel_ht_sta *mi, int group, int rate) } /* + * Find & sort topmost throughput rates + * + * If multiple rates provide equal throughput the sorting is based on their + * current success probability. Higher success probability is preferred among + * MCS groups, CCK rates do not provide aggregation and are therefore at last. 
+ */ +static void +minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index, + u8 *tp_list) +{ + int cur_group, cur_idx, cur_thr, cur_prob; + int tmp_group, tmp_idx, tmp_thr, tmp_prob; + int j = MAX_THR_RATES; + + cur_group = index / MCS_GROUP_RATES; + cur_idx = index % MCS_GROUP_RATES; + cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp; + cur_prob = mi->groups[cur_group].rates[cur_idx].probability; + + tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; + tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; + tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + + while (j > 0 && (cur_thr > tmp_thr || + (cur_thr == tmp_thr && cur_prob > tmp_prob))) { + j--; + tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; + tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; + tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + } + + if (j < MAX_THR_RATES - 1) { + memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * + (MAX_THR_RATES - (j + 1)))); + } + if (j < MAX_THR_RATES) + tp_list[j] = index; +} + +/* + * Find and set the topmost probability rate per sta and per group + */ +static void +minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u8 index) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mr; + int tmp_group, tmp_idx, tmp_tp, tmp_prob, max_tp_group; + + mg = &mi->groups[index / MCS_GROUP_RATES]; + mr = &mg->rates[index % MCS_GROUP_RATES]; + + tmp_group = mi->max_prob_rate / MCS_GROUP_RATES; + tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES; + tmp_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; + + /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from + * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ + max_tp_group = mi->max_tp_rate[0] / MCS_GROUP_RATES; + if((index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) && + 
(max_tp_group != MINSTREL_CCK_GROUP)) + return; + + if (mr->probability > MINSTREL_FRAC(75, 100)) { + if (mr->cur_tp > tmp_tp) + mi->max_prob_rate = index; + if (mr->cur_tp > mg->rates[mg->max_group_prob_rate].cur_tp) + mg->max_group_prob_rate = index; + } else { + if (mr->probability > tmp_prob) + mi->max_prob_rate = index; + if (mr->probability > mg->rates[mg->max_group_prob_rate].probability) + mg->max_group_prob_rate = index; + } +} + + +/* + * Assign new rate set per sta and use CCK rates only if the fastest + * rate (max_tp_rate[0]) is from CCK group. This prohibits such sorted + * rate sets where MCS and CCK rates are mixed, because CCK rates can + * not use aggregation. + */ +static void +minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, + u8 tmp_mcs_tp_rate[MAX_THR_RATES], + u8 tmp_cck_tp_rate[MAX_THR_RATES]) +{ + unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp; + int i; + + tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES; + tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES; + tmp_cck_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + + tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES; + tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES; + tmp_mcs_tp = mi->groups[tmp_group].rates[tmp_idx].cur_tp; + + if (tmp_cck_tp > tmp_mcs_tp) { + for(i = 0; i < MAX_THR_RATES; i++) { + minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], + tmp_mcs_tp_rate); + } + } + +} + +/* + * Try to increase robustness of max_prob rate by decrease number of + * streams if possible. 
+ */ +static inline void +minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) +{ + struct minstrel_mcs_group_data *mg; + struct minstrel_rate_stats *mr; + int tmp_max_streams, group; + int tmp_tp = 0; + + tmp_max_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / + MCS_GROUP_RATES].streams; + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { + mg = &mi->groups[group]; + if (!mg->supported || group == MINSTREL_CCK_GROUP) + continue; + mr = minstrel_get_ratestats(mi, mg->max_group_prob_rate); + if (tmp_tp < mr->cur_tp && + (minstrel_mcs_groups[group].streams < tmp_max_streams)) { + mi->max_prob_rate = mg->max_group_prob_rate; + tmp_tp = mr->cur_tp; + } + } +} + +/* * Update rate statistics and select new primary rates * * Rules for rate selection: * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations - * - as long as the max prob rate has a probability of more than 3/4, pick + * - as long as the max prob rate has a probability of more than 75%, pick * higher throughput rates, even if the probablity is a bit lower */ static void @@ -246,9 +385,9 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mr; - int cur_prob, cur_prob_tp, cur_tp, cur_tp2; - int group, i, index; - bool mi_rates_valid = false; + int group, i, j; + u8 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; + u8 tmp_cck_tp_rate[MAX_THR_RATES], index; if (mi->ampdu_packets > 0) { mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, @@ -260,13 +399,14 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_slow = 0; mi->sample_count = 0; - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - bool mg_rates_valid = false; + /* Initialize global rate indexes */ + for(j = 0; j < MAX_THR_RATES; j++){ + tmp_mcs_tp_rate[j] = 0; + 
tmp_cck_tp_rate[j] = 0; + } - cur_prob = 0; - cur_prob_tp = 0; - cur_tp = 0; - cur_tp2 = 0; + /* Find best rate sets within all MCS groups*/ + for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mg->supported) @@ -274,24 +414,16 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mi->sample_count++; + /* (re)Initialize group rate indexes */ + for(j = 0; j < MAX_THR_RATES; j++) + tmp_group_tp_rate[j] = group; + for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mg->supported & BIT(i))) continue; index = MCS_GROUP_RATES * group + i; - /* initialize rates selections starting indexes */ - if (!mg_rates_valid) { - mg->max_tp_rate = mg->max_tp_rate2 = - mg->max_prob_rate = i; - if (!mi_rates_valid) { - mi->max_tp_rate = mi->max_tp_rate2 = - mi->max_prob_rate = index; - mi_rates_valid = true; - } - mg_rates_valid = true; - } - mr = &mg->rates[i]; mr->retry_updated = false; minstrel_calc_rate_ewma(mr); @@ -300,82 +432,47 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!mr->cur_tp) continue; - if ((mr->cur_tp > cur_prob_tp && mr->probability > - MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) { - mg->max_prob_rate = index; - cur_prob = mr->probability; - cur_prob_tp = mr->cur_tp; - } - - if (mr->cur_tp > cur_tp) { - swap(index, mg->max_tp_rate); - cur_tp = mr->cur_tp; - mr = minstrel_get_ratestats(mi, index); - } - - if (index >= mg->max_tp_rate) - continue; - - if (mr->cur_tp > cur_tp2) { - mg->max_tp_rate2 = index; - cur_tp2 = mr->cur_tp; + /* Find max throughput rate set */ + if (group != MINSTREL_CCK_GROUP) { + minstrel_ht_sort_best_tp_rates(mi, index, + tmp_mcs_tp_rate); + } else if (group == MINSTREL_CCK_GROUP) { + minstrel_ht_sort_best_tp_rates(mi, index, + tmp_cck_tp_rate); } - } - } - /* try to sample all available rates during each interval */ - mi->sample_count *= 8; + /* Find max throughput rate set within a group */ + 
minstrel_ht_sort_best_tp_rates(mi, index, + tmp_group_tp_rate); - cur_prob = 0; - cur_prob_tp = 0; - cur_tp = 0; - cur_tp2 = 0; - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - mg = &mi->groups[group]; - if (!mg->supported) - continue; - - mr = minstrel_get_ratestats(mi, mg->max_tp_rate); - if (cur_tp < mr->cur_tp) { - mi->max_tp_rate2 = mi->max_tp_rate; - cur_tp2 = cur_tp; - mi->max_tp_rate = mg->max_tp_rate; - cur_tp = mr->cur_tp; - mi->max_prob_streams = minstrel_mcs_groups[group].streams - 1; + /* Find max probability rate per group and global */ + minstrel_ht_set_best_prob_rate(mi, index); } - mr = minstrel_get_ratestats(mi, mg->max_tp_rate2); - if (cur_tp2 < mr->cur_tp) { - mi->max_tp_rate2 = mg->max_tp_rate2; - cur_tp2 = mr->cur_tp; - } + memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, + sizeof(mg->max_group_tp_rate)); } - if (mi->max_prob_streams < 1) - mi->max_prob_streams = 1; + /* Assign new rate set per sta */ + minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_cck_tp_rate); + memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); - for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { - mg = &mi->groups[group]; - if (!mg->supported) - continue; - mr = minstrel_get_ratestats(mi, mg->max_prob_rate); - if (cur_prob_tp < mr->cur_tp && - minstrel_mcs_groups[group].streams <= mi->max_prob_streams) { - mi->max_prob_rate = mg->max_prob_rate; - cur_prob = mr->cur_prob; - cur_prob_tp = mr->cur_tp; - } - } + /* Try to increase robustness of max_prob_rate*/ + minstrel_ht_prob_rate_reduce_streams(mi); + + /* try to sample all available rates during each interval */ + mi->sample_count *= 8; #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ if (mp->fixed_rate_idx != -1) { - mi->max_tp_rate = mp->fixed_rate_idx; - mi->max_tp_rate2 = mp->fixed_rate_idx; + for (i = 0; i < 4; i++) + mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; } #endif + /* Reset update timer */ 
mi->stats_update = jiffies; } @@ -420,8 +517,7 @@ minstrel_next_sample_idx(struct minstrel_ht_sta *mi) } static void -minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx, - bool primary) +minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u8 *idx, bool primary) { int group, orig_group; @@ -437,9 +533,9 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx, continue; if (primary) - *idx = mi->groups[group].max_tp_rate; + *idx = mi->groups[group].max_group_tp_rate[0]; else - *idx = mi->groups[group].max_tp_rate2; + *idx = mi->groups[group].max_group_tp_rate[1]; break; } } @@ -524,19 +620,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, * check for sudden death of spatial multiplexing, * downgrade to a lower number of streams if necessary. */ - rate = minstrel_get_ratestats(mi, mi->max_tp_rate); + rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && MINSTREL_FRAC(rate->success, rate->attempts) < MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate, true); + minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } - rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2); + rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && MINSTREL_FRAC(rate2->success, rate2->attempts) < MINSTREL_FRAC(20, 100)) { - minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false); + minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } @@ -661,12 +757,12 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) if (!rates) return; - /* Start with max_tp_rate */ - minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate); + /* Start with max_tp_rate[0] */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); if (mp->hw->max_rates >= 3) { - /* At least 3 tx rates supported, use max_tp_rate2 next */ - minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate2); + /* At least 3 tx rates 
supported, use max_tp_rate[1] next */ + minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[1]); } if (mp->hw->max_rates >= 2) { @@ -691,7 +787,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_rate_stats *mr; struct minstrel_mcs_group_data *mg; - unsigned int sample_dur, sample_group; + unsigned int sample_dur, sample_group, cur_max_tp_streams; int sample_idx = 0; if (mi->sample_wait > 0) { @@ -718,8 +814,8 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * to the frame. Hence, don't use sampling for the currently * used rates. */ - if (sample_idx == mi->max_tp_rate || - sample_idx == mi->max_tp_rate2 || + if (sample_idx == mi->max_tp_rate[0] || + sample_idx == mi->max_tp_rate[1] || sample_idx == mi->max_prob_rate) return -1; @@ -734,9 +830,12 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * Make sure that lower rates get sampled only occasionally, * if the link is working perfectly. 
*/ + + cur_max_tp_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / + MCS_GROUP_RATES].streams; sample_dur = minstrel_get_duration(sample_idx); - if (sample_dur >= minstrel_get_duration(mi->max_tp_rate2) && - (mi->max_prob_streams < + if (sample_dur >= minstrel_get_duration(mi->max_tp_rate[1]) && + (cur_max_tp_streams - 1 < minstrel_mcs_groups[sample_group].streams || sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { if (mr->sample_skipped < 20) @@ -1041,8 +1140,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) if (!msp->is_ht) return mac80211_minstrel.get_expected_throughput(priv_sta); - i = mi->max_tp_rate / MCS_GROUP_RATES; - j = mi->max_tp_rate % MCS_GROUP_RATES; + i = mi->max_tp_rate[0] / MCS_GROUP_RATES; + j = mi->max_tp_rate[0] % MCS_GROUP_RATES; /* convert cur_tp from pkt per second in kbps */ return mi->groups[i].rates[j].cur_tp * AVG_PKT_SIZE * 8 / 1024; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index d655586773ac..01570e0e014b 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -26,28 +26,6 @@ struct mcs_group { extern const struct mcs_group minstrel_mcs_groups[]; -struct minstrel_rate_stats { - /* current / last sampling period attempts/success counters */ - unsigned int attempts, last_attempts; - unsigned int success, last_success; - - /* total attempts/success counters */ - u64 att_hist, succ_hist; - - /* current throughput */ - unsigned int cur_tp; - - /* packet delivery probabilities */ - unsigned int cur_prob, probability; - - /* maximum retry counts */ - unsigned int retry_count; - unsigned int retry_count_rtscts; - - bool retry_updated; - u8 sample_skipped; -}; - struct minstrel_mcs_group_data { u8 index; u8 column; @@ -55,10 +33,9 @@ struct minstrel_mcs_group_data { /* bitfield of supported MCS rates of this group */ u8 supported; - /* selected primary rates */ - unsigned int max_tp_rate; - unsigned int max_tp_rate2; - unsigned int 
max_prob_rate; + /* sorted rate set within a MCS group*/ + u8 max_group_tp_rate[MAX_THR_RATES]; + u8 max_group_prob_rate; /* MCS rate statistics */ struct minstrel_rate_stats rates[MCS_GROUP_RATES]; @@ -74,15 +51,9 @@ struct minstrel_ht_sta { /* ampdu length (EWMA) */ unsigned int avg_ampdu_len; - /* best throughput rate */ - unsigned int max_tp_rate; - - /* second best throughput rate */ - unsigned int max_tp_rate2; - - /* best probability rate */ - unsigned int max_prob_rate; - unsigned int max_prob_streams; + /* overall sorted rate set */ + u8 max_tp_rate[MAX_THR_RATES]; + u8 max_prob_rate; /* time of last status update */ unsigned long stats_update; diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 3e7d793de0c3..a72ad46f2a04 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -46,8 +46,10 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) else p += sprintf(p, "HT%c0/%cGI ", htmode, gimode); - *(p++) = (idx == mi->max_tp_rate) ? 'T' : ' '; - *(p++) = (idx == mi->max_tp_rate2) ? 't' : ' '; + *(p++) = (idx == mi->max_tp_rate[0]) ? 'A' : ' '; + *(p++) = (idx == mi->max_tp_rate[1]) ? 'B' : ' '; + *(p++) = (idx == mi->max_tp_rate[2]) ? 'C' : ' '; + *(p++) = (idx == mi->max_tp_rate[3]) ? 'D' : ' '; *(p++) = (idx == mi->max_prob_rate) ? 
'P' : ' '; if (i == max_mcs) { @@ -100,8 +102,8 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) file->private_data = ms; p = ms->buf; - p += sprintf(p, "type rate throughput ewma prob this prob " - "retry this succ/attempt success attempts\n"); + p += sprintf(p, "type rate throughput ewma prob " + "this prob retry this succ/attempt success attempts\n"); p = minstrel_ht_stats_dump(mi, max_mcs, p); for (i = 0; i < max_mcs; i++) diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h deleted file mode 100644 index 19111c7bf454..000000000000 --- a/net/mac80211/rc80211_pid.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> - * Copyright 2007, Stefano Brivio <stefano.brivio@polimi.it> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef RC80211_PID_H -#define RC80211_PID_H - -/* Sampling period for measuring percentage of failed frames in ms. */ -#define RC_PID_INTERVAL 125 - -/* Exponential averaging smoothness (used for I part of PID controller) */ -#define RC_PID_SMOOTHING_SHIFT 3 -#define RC_PID_SMOOTHING (1 << RC_PID_SMOOTHING_SHIFT) - -/* Sharpening factor (used for D part of PID controller) */ -#define RC_PID_SHARPENING_FACTOR 0 -#define RC_PID_SHARPENING_DURATION 0 - -/* Fixed point arithmetic shifting amount. */ -#define RC_PID_ARITH_SHIFT 8 - -/* Proportional PID component coefficient. */ -#define RC_PID_COEFF_P 15 -/* Integral PID component coefficient. */ -#define RC_PID_COEFF_I 9 -/* Derivative PID component coefficient. */ -#define RC_PID_COEFF_D 15 - -/* Target failed frames rate for the PID controller. NB: This effectively gives - * maximum failed frames percentage we're willing to accept. If the wireless - * link quality is good, the controller will fail to adjust failed frames - * percentage to the target. 
This is intentional. - */ -#define RC_PID_TARGET_PF 14 - -/* Rate behaviour normalization quantity over time. */ -#define RC_PID_NORM_OFFSET 3 - -/* Push high rates right after loading. */ -#define RC_PID_FAST_START 0 - -/* Arithmetic right shift for positive and negative values for ISO C. */ -#define RC_PID_DO_ARITH_RIGHT_SHIFT(x, y) \ - ((x) < 0 ? -((-(x)) >> (y)) : (x) >> (y)) - -enum rc_pid_event_type { - RC_PID_EVENT_TYPE_TX_STATUS, - RC_PID_EVENT_TYPE_RATE_CHANGE, - RC_PID_EVENT_TYPE_TX_RATE, - RC_PID_EVENT_TYPE_PF_SAMPLE, -}; - -union rc_pid_event_data { - /* RC_PID_EVENT_TX_STATUS */ - struct { - u32 flags; - struct ieee80211_tx_info tx_status; - }; - /* RC_PID_EVENT_TYPE_RATE_CHANGE */ - /* RC_PID_EVENT_TYPE_TX_RATE */ - struct { - int index; - int rate; - }; - /* RC_PID_EVENT_TYPE_PF_SAMPLE */ - struct { - s32 pf_sample; - s32 prop_err; - s32 int_err; - s32 der_err; - }; -}; - -struct rc_pid_event { - /* The time when the event occurred */ - unsigned long timestamp; - - /* Event ID number */ - unsigned int id; - - /* Type of event */ - enum rc_pid_event_type type; - - /* type specific data */ - union rc_pid_event_data data; -}; - -/* Size of the event ring buffer. */ -#define RC_PID_EVENT_RING_SIZE 32 - -struct rc_pid_event_buffer { - /* Counter that generates event IDs */ - unsigned int ev_count; - - /* Ring buffer of events */ - struct rc_pid_event ring[RC_PID_EVENT_RING_SIZE]; - - /* Index to the entry in events_buf to be reused */ - unsigned int next_entry; - - /* Lock that guards against concurrent access to this buffer struct */ - spinlock_t lock; - - /* Wait queue for poll/select and blocking I/O */ - wait_queue_head_t waitqueue; -}; - -struct rc_pid_events_file_info { - /* The event buffer we read */ - struct rc_pid_event_buffer *events; - - /* The entry we have should read next */ - unsigned int next_entry; -}; - -/** - * struct rc_pid_debugfs_entries - tunable parameters - * - * Algorithm parameters, tunable via debugfs. 
- * @target: target percentage for failed frames - * @sampling_period: error sampling interval in milliseconds - * @coeff_p: absolute value of the proportional coefficient - * @coeff_i: absolute value of the integral coefficient - * @coeff_d: absolute value of the derivative coefficient - * @smoothing_shift: absolute value of the integral smoothing factor (i.e. - * amount of smoothing introduced by the exponential moving average) - * @sharpen_factor: absolute value of the derivative sharpening factor (i.e. - * amount of emphasis given to the derivative term after low activity - * events) - * @sharpen_duration: duration of the sharpening effect after the detected low - * activity event, relative to sampling_period - * @norm_offset: amount of normalization periodically performed on the learnt - * rate behaviour values (lower means we should trust more what we learnt - * about behaviour of rates, higher means we should trust more the natural - * ordering of rates) - */ -struct rc_pid_debugfs_entries { - struct dentry *target; - struct dentry *sampling_period; - struct dentry *coeff_p; - struct dentry *coeff_i; - struct dentry *coeff_d; - struct dentry *smoothing_shift; - struct dentry *sharpen_factor; - struct dentry *sharpen_duration; - struct dentry *norm_offset; -}; - -void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, - struct ieee80211_tx_info *stat); - -void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, - int index, int rate); - -void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, - int index, int rate); - -void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, - s32 pf_sample, s32 prop_err, - s32 int_err, s32 der_err); - -void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, - struct dentry *dir); - -void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta); - -struct rc_pid_sta_info { - unsigned long last_change; - unsigned long last_sample; - - u32 tx_num_failed; 
- u32 tx_num_xmit; - - int txrate_idx; - - /* Average failed frames percentage error (i.e. actual vs. target - * percentage), scaled by RC_PID_SMOOTHING. This value is computed - * using using an exponential weighted average technique: - * - * (RC_PID_SMOOTHING - 1) * err_avg_old + err - * err_avg = ------------------------------------------ - * RC_PID_SMOOTHING - * - * where err_avg is the new approximation, err_avg_old the previous one - * and err is the error w.r.t. to the current failed frames percentage - * sample. Note that the bigger RC_PID_SMOOTHING the more weight is - * given to the previous estimate, resulting in smoother behavior (i.e. - * corresponding to a longer integration window). - * - * For computation, we actually don't use the above formula, but this - * one: - * - * err_avg_scaled = err_avg_old_scaled - err_avg_old + err - * - * where: - * err_avg_scaled = err * RC_PID_SMOOTHING - * err_avg_old_scaled = err_avg_old * RC_PID_SMOOTHING - * - * This avoids floating point numbers and the per_failed_old value can - * easily be obtained by shifting per_failed_old_scaled right by - * RC_PID_SMOOTHING_SHIFT. - */ - s32 err_avg_sc; - - /* Last framed failes percentage sample. */ - u32 last_pf; - - /* Sharpening needed. */ - u8 sharp_cnt; - -#ifdef CONFIG_MAC80211_DEBUGFS - /* Event buffer */ - struct rc_pid_event_buffer events; - - /* Events debugfs file entry */ - struct dentry *events_entry; -#endif -}; - -/* Algorithm parameters. We keep them on a per-algorithm approach, so they can - * be tuned individually for each interface. - */ -struct rc_pid_rateinfo { - - /* Map sorted rates to rates in ieee80211_hw_mode. */ - int index; - - /* Map rates in ieee80211_hw_mode to sorted rates. */ - int rev_index; - - /* Did we do any measurement on this rate? */ - bool valid; - - /* Comparison with the lowest rate. */ - int diff; -}; - -struct rc_pid_info { - - /* The failed frames percentage target. 
*/ - unsigned int target; - - /* Rate at which failed frames percentage is sampled in 0.001s. */ - unsigned int sampling_period; - - /* P, I and D coefficients. */ - int coeff_p; - int coeff_i; - int coeff_d; - - /* Exponential averaging shift. */ - unsigned int smoothing_shift; - - /* Sharpening factor and duration. */ - unsigned int sharpen_factor; - unsigned int sharpen_duration; - - /* Normalization offset. */ - unsigned int norm_offset; - - /* Rates information. */ - struct rc_pid_rateinfo *rinfo; - - /* Index of the last used rate. */ - int oldrate; - -#ifdef CONFIG_MAC80211_DEBUGFS - /* Debugfs entries created for the parameters above. */ - struct rc_pid_debugfs_entries dentries; -#endif -}; - -#endif /* RC80211_PID_H */ diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c deleted file mode 100644 index d0da2a70fe68..000000000000 --- a/net/mac80211/rc80211_pid_algo.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * Copyright 2002-2005, Instant802 Networks, Inc. - * Copyright 2005, Devicescape Software, Inc. - * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> - * Copyright 2007-2008, Stefano Brivio <stefano.brivio@polimi.it> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/netdevice.h> -#include <linux/types.h> -#include <linux/skbuff.h> -#include <linux/debugfs.h> -#include <linux/slab.h> -#include <net/mac80211.h> -#include "rate.h" -#include "mesh.h" -#include "rc80211_pid.h" - - -/* This is an implementation of a TX rate control algorithm that uses a PID - * controller. Given a target failed frames rate, the controller decides about - * TX rate changes to meet the target failed frames rate. 
- * - * The controller basically computes the following: - * - * adj = CP * err + CI * err_avg + CD * (err - last_err) * (1 + sharpening) - * - * where - * adj adjustment value that is used to switch TX rate (see below) - * err current error: target vs. current failed frames percentage - * last_err last error - * err_avg average (i.e. poor man's integral) of recent errors - * sharpening non-zero when fast response is needed (i.e. right after - * association or no frames sent for a long time), heading - * to zero over time - * CP Proportional coefficient - * CI Integral coefficient - * CD Derivative coefficient - * - * CP, CI, CD are subject to careful tuning. - * - * The integral component uses a exponential moving average approach instead of - * an actual sliding window. The advantage is that we don't need to keep an - * array of the last N error values and computation is easier. - * - * Once we have the adj value, we map it to a rate by means of a learning - * algorithm. This algorithm keeps the state of the percentual failed frames - * difference between rates. The behaviour of the lowest available rate is kept - * as a reference value, and every time we switch between two rates, we compute - * the difference between the failed frames each rate exhibited. By doing so, - * we compare behaviours which different rates exhibited in adjacent timeslices, - * thus the comparison is minimally affected by external conditions. This - * difference gets propagated to the whole set of measurements, so that the - * reference is always the same. Periodically, we normalize this set so that - * recent events weigh the most. By comparing the adj value with this set, we - * avoid pejorative switches to lower rates and allow for switches to higher - * rates if they behaved well. - * - * Note that for the computations we use a fixed-point representation to avoid - * floating point arithmetic. Hence, all values are shifted left by - * RC_PID_ARITH_SHIFT. 
- */ - - -/* Adjust the rate while ensuring that we won't switch to a lower rate if it - * exhibited a worse failed frames behaviour and we'll choose the highest rate - * whose failed frames behaviour is not worse than the one of the original rate - * target. While at it, check that the new rate is valid. */ -static void rate_control_pid_adjust_rate(struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, - struct rc_pid_sta_info *spinfo, int adj, - struct rc_pid_rateinfo *rinfo) -{ - int cur_sorted, new_sorted, probe, tmp, n_bitrates, band; - int cur = spinfo->txrate_idx; - - band = sband->band; - n_bitrates = sband->n_bitrates; - - /* Map passed arguments to sorted values. */ - cur_sorted = rinfo[cur].rev_index; - new_sorted = cur_sorted + adj; - - /* Check limits. */ - if (new_sorted < 0) - new_sorted = rinfo[0].rev_index; - else if (new_sorted >= n_bitrates) - new_sorted = rinfo[n_bitrates - 1].rev_index; - - tmp = new_sorted; - - if (adj < 0) { - /* Ensure that the rate decrease isn't disadvantageous. */ - for (probe = cur_sorted; probe >= new_sorted; probe--) - if (rinfo[probe].diff <= rinfo[cur_sorted].diff && - rate_supported(sta, band, rinfo[probe].index)) - tmp = probe; - } else { - /* Look for rate increase with zero (or below) cost. */ - for (probe = new_sorted + 1; probe < n_bitrates; probe++) - if (rinfo[probe].diff <= rinfo[new_sorted].diff && - rate_supported(sta, band, rinfo[probe].index)) - tmp = probe; - } - - /* Fit the rate found to the nearest supported rate. */ - do { - if (rate_supported(sta, band, rinfo[tmp].index)) { - spinfo->txrate_idx = rinfo[tmp].index; - break; - } - if (adj < 0) - tmp--; - else - tmp++; - } while (tmp < n_bitrates && tmp >= 0); - -#ifdef CONFIG_MAC80211_DEBUGFS - rate_control_pid_event_rate_change(&spinfo->events, - spinfo->txrate_idx, - sband->bitrates[spinfo->txrate_idx].bitrate); -#endif -} - -/* Normalize the failed frames per-rate differences. 
*/ -static void rate_control_pid_normalize(struct rc_pid_info *pinfo, int l) -{ - int i, norm_offset = pinfo->norm_offset; - struct rc_pid_rateinfo *r = pinfo->rinfo; - - if (r[0].diff > norm_offset) - r[0].diff -= norm_offset; - else if (r[0].diff < -norm_offset) - r[0].diff += norm_offset; - for (i = 0; i < l - 1; i++) - if (r[i + 1].diff > r[i].diff + norm_offset) - r[i + 1].diff -= norm_offset; - else if (r[i + 1].diff <= r[i].diff) - r[i + 1].diff += norm_offset; -} - -static void rate_control_pid_sample(struct rc_pid_info *pinfo, - struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, - struct rc_pid_sta_info *spinfo) -{ - struct rc_pid_rateinfo *rinfo = pinfo->rinfo; - u32 pf; - s32 err_avg; - u32 err_prop; - u32 err_int; - u32 err_der; - int adj, i, j, tmp; - unsigned long period; - - /* In case nothing happened during the previous control interval, turn - * the sharpening factor on. */ - period = msecs_to_jiffies(pinfo->sampling_period); - if (jiffies - spinfo->last_sample > 2 * period) - spinfo->sharp_cnt = pinfo->sharpen_duration; - - spinfo->last_sample = jiffies; - - /* This should never happen, but in case, we assume the old sample is - * still a good measurement and copy it. */ - if (unlikely(spinfo->tx_num_xmit == 0)) - pf = spinfo->last_pf; - else - pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit; - - spinfo->tx_num_xmit = 0; - spinfo->tx_num_failed = 0; - - /* If we just switched rate, update the rate behaviour info. */ - if (pinfo->oldrate != spinfo->txrate_idx) { - - i = rinfo[pinfo->oldrate].rev_index; - j = rinfo[spinfo->txrate_idx].rev_index; - - tmp = (pf - spinfo->last_pf); - tmp = RC_PID_DO_ARITH_RIGHT_SHIFT(tmp, RC_PID_ARITH_SHIFT); - - rinfo[j].diff = rinfo[i].diff + tmp; - pinfo->oldrate = spinfo->txrate_idx; - } - rate_control_pid_normalize(pinfo, sband->n_bitrates); - - /* Compute the proportional, integral and derivative errors. 
*/ - err_prop = (pinfo->target - pf) << RC_PID_ARITH_SHIFT; - - err_avg = spinfo->err_avg_sc >> pinfo->smoothing_shift; - spinfo->err_avg_sc = spinfo->err_avg_sc - err_avg + err_prop; - err_int = spinfo->err_avg_sc >> pinfo->smoothing_shift; - - err_der = (pf - spinfo->last_pf) * - (1 + pinfo->sharpen_factor * spinfo->sharp_cnt); - spinfo->last_pf = pf; - if (spinfo->sharp_cnt) - spinfo->sharp_cnt--; - -#ifdef CONFIG_MAC80211_DEBUGFS - rate_control_pid_event_pf_sample(&spinfo->events, pf, err_prop, err_int, - err_der); -#endif - - /* Compute the controller output. */ - adj = (err_prop * pinfo->coeff_p + err_int * pinfo->coeff_i - + err_der * pinfo->coeff_d); - adj = RC_PID_DO_ARITH_RIGHT_SHIFT(adj, 2 * RC_PID_ARITH_SHIFT); - - /* Change rate. */ - if (adj) - rate_control_pid_adjust_rate(sband, sta, spinfo, adj, rinfo); -} - -static void rate_control_pid_tx_status(void *priv, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta, - struct sk_buff *skb) -{ - struct rc_pid_info *pinfo = priv; - struct rc_pid_sta_info *spinfo = priv_sta; - unsigned long period; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - - if (!spinfo) - return; - - /* Ignore all frames that were sent with a different rate than the rate - * we currently advise mac80211 to use. */ - if (info->status.rates[0].idx != spinfo->txrate_idx) - return; - - spinfo->tx_num_xmit++; - -#ifdef CONFIG_MAC80211_DEBUGFS - rate_control_pid_event_tx_status(&spinfo->events, info); -#endif - - /* We count frames that totally failed to be transmitted as two bad - * frames, those that made it out but had some retries as one good and - * one bad frame. */ - if (!(info->flags & IEEE80211_TX_STAT_ACK)) { - spinfo->tx_num_failed += 2; - spinfo->tx_num_xmit++; - } else if (info->status.rates[0].count > 1) { - spinfo->tx_num_failed++; - spinfo->tx_num_xmit++; - } - - /* Update PID controller state. 
*/ - period = msecs_to_jiffies(pinfo->sampling_period); - if (time_after(jiffies, spinfo->last_sample + period)) - rate_control_pid_sample(pinfo, sband, sta, spinfo); -} - -static void -rate_control_pid_get_rate(void *priv, struct ieee80211_sta *sta, - void *priv_sta, - struct ieee80211_tx_rate_control *txrc) -{ - struct sk_buff *skb = txrc->skb; - struct ieee80211_supported_band *sband = txrc->sband; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct rc_pid_sta_info *spinfo = priv_sta; - int rateidx; - - if (txrc->rts) - info->control.rates[0].count = - txrc->hw->conf.long_frame_max_tx_count; - else - info->control.rates[0].count = - txrc->hw->conf.short_frame_max_tx_count; - - /* Send management frames and NO_ACK data using lowest rate. */ - if (rate_control_send_low(sta, priv_sta, txrc)) - return; - - rateidx = spinfo->txrate_idx; - - if (rateidx >= sband->n_bitrates) - rateidx = sband->n_bitrates - 1; - - info->control.rates[0].idx = rateidx; - -#ifdef CONFIG_MAC80211_DEBUGFS - rate_control_pid_event_tx_rate(&spinfo->events, - rateidx, sband->bitrates[rateidx].bitrate); -#endif -} - -static void -rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband, - struct cfg80211_chan_def *chandef, - struct ieee80211_sta *sta, void *priv_sta) -{ - struct rc_pid_sta_info *spinfo = priv_sta; - struct rc_pid_info *pinfo = priv; - struct rc_pid_rateinfo *rinfo = pinfo->rinfo; - int i, j, tmp; - bool s; - - /* TODO: This routine should consider using RSSI from previous packets - * as we need to have IEEE 802.1X auth succeed immediately after assoc.. - * Until that method is implemented, we will use the lowest supported - * rate as a workaround. */ - - /* Sort the rates. This is optimized for the most common case (i.e. - * almost-sorted CCK+OFDM rates). Kind of bubble-sort with reversed - * mapping too. 
*/ - for (i = 0; i < sband->n_bitrates; i++) { - rinfo[i].index = i; - rinfo[i].rev_index = i; - if (RC_PID_FAST_START) - rinfo[i].diff = 0; - else - rinfo[i].diff = i * pinfo->norm_offset; - } - for (i = 1; i < sband->n_bitrates; i++) { - s = false; - for (j = 0; j < sband->n_bitrates - i; j++) - if (unlikely(sband->bitrates[rinfo[j].index].bitrate > - sband->bitrates[rinfo[j + 1].index].bitrate)) { - tmp = rinfo[j].index; - rinfo[j].index = rinfo[j + 1].index; - rinfo[j + 1].index = tmp; - rinfo[rinfo[j].index].rev_index = j; - rinfo[rinfo[j + 1].index].rev_index = j + 1; - s = true; - } - if (!s) - break; - } - - spinfo->txrate_idx = rate_lowest_index(sband, sta); -} - -static void *rate_control_pid_alloc(struct ieee80211_hw *hw, - struct dentry *debugfsdir) -{ - struct rc_pid_info *pinfo; - struct rc_pid_rateinfo *rinfo; - struct ieee80211_supported_band *sband; - int i, max_rates = 0; -#ifdef CONFIG_MAC80211_DEBUGFS - struct rc_pid_debugfs_entries *de; -#endif - - pinfo = kmalloc(sizeof(*pinfo), GFP_ATOMIC); - if (!pinfo) - return NULL; - - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { - sband = hw->wiphy->bands[i]; - if (sband && sband->n_bitrates > max_rates) - max_rates = sband->n_bitrates; - } - - rinfo = kmalloc(sizeof(*rinfo) * max_rates, GFP_ATOMIC); - if (!rinfo) { - kfree(pinfo); - return NULL; - } - - pinfo->target = RC_PID_TARGET_PF; - pinfo->sampling_period = RC_PID_INTERVAL; - pinfo->coeff_p = RC_PID_COEFF_P; - pinfo->coeff_i = RC_PID_COEFF_I; - pinfo->coeff_d = RC_PID_COEFF_D; - pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT; - pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR; - pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION; - pinfo->norm_offset = RC_PID_NORM_OFFSET; - pinfo->rinfo = rinfo; - pinfo->oldrate = 0; - -#ifdef CONFIG_MAC80211_DEBUGFS - de = &pinfo->dentries; - de->target = debugfs_create_u32("target_pf", S_IRUSR | S_IWUSR, - debugfsdir, &pinfo->target); - de->sampling_period = debugfs_create_u32("sampling_period", - S_IRUSR | 
S_IWUSR, debugfsdir, - &pinfo->sampling_period); - de->coeff_p = debugfs_create_u32("coeff_p", S_IRUSR | S_IWUSR, - debugfsdir, (u32 *)&pinfo->coeff_p); - de->coeff_i = debugfs_create_u32("coeff_i", S_IRUSR | S_IWUSR, - debugfsdir, (u32 *)&pinfo->coeff_i); - de->coeff_d = debugfs_create_u32("coeff_d", S_IRUSR | S_IWUSR, - debugfsdir, (u32 *)&pinfo->coeff_d); - de->smoothing_shift = debugfs_create_u32("smoothing_shift", - S_IRUSR | S_IWUSR, debugfsdir, - &pinfo->smoothing_shift); - de->sharpen_factor = debugfs_create_u32("sharpen_factor", - S_IRUSR | S_IWUSR, debugfsdir, - &pinfo->sharpen_factor); - de->sharpen_duration = debugfs_create_u32("sharpen_duration", - S_IRUSR | S_IWUSR, debugfsdir, - &pinfo->sharpen_duration); - de->norm_offset = debugfs_create_u32("norm_offset", - S_IRUSR | S_IWUSR, debugfsdir, - &pinfo->norm_offset); -#endif - - return pinfo; -} - -static void rate_control_pid_free(void *priv) -{ - struct rc_pid_info *pinfo = priv; -#ifdef CONFIG_MAC80211_DEBUGFS - struct rc_pid_debugfs_entries *de = &pinfo->dentries; - - debugfs_remove(de->norm_offset); - debugfs_remove(de->sharpen_duration); - debugfs_remove(de->sharpen_factor); - debugfs_remove(de->smoothing_shift); - debugfs_remove(de->coeff_d); - debugfs_remove(de->coeff_i); - debugfs_remove(de->coeff_p); - debugfs_remove(de->sampling_period); - debugfs_remove(de->target); -#endif - - kfree(pinfo->rinfo); - kfree(pinfo); -} - -static void *rate_control_pid_alloc_sta(void *priv, struct ieee80211_sta *sta, - gfp_t gfp) -{ - struct rc_pid_sta_info *spinfo; - - spinfo = kzalloc(sizeof(*spinfo), gfp); - if (spinfo == NULL) - return NULL; - - spinfo->last_sample = jiffies; - -#ifdef CONFIG_MAC80211_DEBUGFS - spin_lock_init(&spinfo->events.lock); - init_waitqueue_head(&spinfo->events.waitqueue); -#endif - - return spinfo; -} - -static void rate_control_pid_free_sta(void *priv, struct ieee80211_sta *sta, - void *priv_sta) -{ - kfree(priv_sta); -} - -static const struct rate_control_ops mac80211_rcpid = { - 
.name = "pid", - .tx_status = rate_control_pid_tx_status, - .get_rate = rate_control_pid_get_rate, - .rate_init = rate_control_pid_rate_init, - .alloc = rate_control_pid_alloc, - .free = rate_control_pid_free, - .alloc_sta = rate_control_pid_alloc_sta, - .free_sta = rate_control_pid_free_sta, -#ifdef CONFIG_MAC80211_DEBUGFS - .add_sta_debugfs = rate_control_pid_add_sta_debugfs, - .remove_sta_debugfs = rate_control_pid_remove_sta_debugfs, -#endif -}; - -int __init rc80211_pid_init(void) -{ - return ieee80211_rate_control_register(&mac80211_rcpid); -} - -void rc80211_pid_exit(void) -{ - ieee80211_rate_control_unregister(&mac80211_rcpid); -} diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c deleted file mode 100644 index 6ff134650a84..000000000000 --- a/net/mac80211/rc80211_pid_debugfs.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/sched.h> -#include <linux/spinlock.h> -#include <linux/poll.h> -#include <linux/netdevice.h> -#include <linux/types.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/export.h> - -#include <net/mac80211.h> -#include "rate.h" - -#include "rc80211_pid.h" - -static void rate_control_pid_event(struct rc_pid_event_buffer *buf, - enum rc_pid_event_type type, - union rc_pid_event_data *data) -{ - struct rc_pid_event *ev; - unsigned long status; - - spin_lock_irqsave(&buf->lock, status); - ev = &(buf->ring[buf->next_entry]); - buf->next_entry = (buf->next_entry + 1) % RC_PID_EVENT_RING_SIZE; - - ev->timestamp = jiffies; - ev->id = buf->ev_count++; - ev->type = type; - ev->data = *data; - - spin_unlock_irqrestore(&buf->lock, status); - - wake_up_all(&buf->waitqueue); -} - -void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf, - struct ieee80211_tx_info *stat) -{ - union rc_pid_event_data evd; - - evd.flags = stat->flags; - memcpy(&evd.tx_status, stat, sizeof(struct ieee80211_tx_info)); - rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_STATUS, &evd); -} - -void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf, - int index, int rate) -{ - union rc_pid_event_data evd; - - evd.index = index; - evd.rate = rate; - rate_control_pid_event(buf, RC_PID_EVENT_TYPE_RATE_CHANGE, &evd); -} - -void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf, - int index, int rate) -{ - union rc_pid_event_data evd; - - evd.index = index; - evd.rate = rate; - rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_RATE, &evd); -} - -void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf, - s32 pf_sample, s32 prop_err, - s32 int_err, s32 der_err) -{ - union rc_pid_event_data evd; - - evd.pf_sample = pf_sample; - evd.prop_err = prop_err; - evd.int_err = int_err; - evd.der_err = der_err; - rate_control_pid_event(buf, RC_PID_EVENT_TYPE_PF_SAMPLE, &evd); -} - -static int 
rate_control_pid_events_open(struct inode *inode, struct file *file) -{ - struct rc_pid_sta_info *sinfo = inode->i_private; - struct rc_pid_event_buffer *events = &sinfo->events; - struct rc_pid_events_file_info *file_info; - unsigned long status; - - /* Allocate a state struct */ - file_info = kmalloc(sizeof(*file_info), GFP_KERNEL); - if (file_info == NULL) - return -ENOMEM; - - spin_lock_irqsave(&events->lock, status); - - file_info->next_entry = events->next_entry; - file_info->events = events; - - spin_unlock_irqrestore(&events->lock, status); - - file->private_data = file_info; - - return 0; -} - -static int rate_control_pid_events_release(struct inode *inode, - struct file *file) -{ - struct rc_pid_events_file_info *file_info = file->private_data; - - kfree(file_info); - - return 0; -} - -static unsigned int rate_control_pid_events_poll(struct file *file, - poll_table *wait) -{ - struct rc_pid_events_file_info *file_info = file->private_data; - - poll_wait(file, &file_info->events->waitqueue, wait); - - return POLLIN | POLLRDNORM; -} - -#define RC_PID_PRINT_BUF_SIZE 64 - -static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, - size_t length, loff_t *offset) -{ - struct rc_pid_events_file_info *file_info = file->private_data; - struct rc_pid_event_buffer *events = file_info->events; - struct rc_pid_event *ev; - char pb[RC_PID_PRINT_BUF_SIZE]; - int ret; - int p; - unsigned long status; - - /* Check if there is something to read. */ - if (events->next_entry == file_info->next_entry) { - if (file->f_flags & O_NONBLOCK) - return -EAGAIN; - - /* Wait */ - ret = wait_event_interruptible(events->waitqueue, - events->next_entry != file_info->next_entry); - - if (ret) - return ret; - } - - /* Write out one event per call. I don't care whether it's a little - * inefficient, this is debugging code anyway. 
*/ - spin_lock_irqsave(&events->lock, status); - - /* Get an event */ - ev = &(events->ring[file_info->next_entry]); - file_info->next_entry = (file_info->next_entry + 1) % - RC_PID_EVENT_RING_SIZE; - - /* Print information about the event. Note that userspace needs to - * provide large enough buffers. */ - length = length < RC_PID_PRINT_BUF_SIZE ? - length : RC_PID_PRINT_BUF_SIZE; - p = scnprintf(pb, length, "%u %lu ", ev->id, ev->timestamp); - switch (ev->type) { - case RC_PID_EVENT_TYPE_TX_STATUS: - p += scnprintf(pb + p, length - p, "tx_status %u %u", - !(ev->data.flags & IEEE80211_TX_STAT_ACK), - ev->data.tx_status.status.rates[0].idx); - break; - case RC_PID_EVENT_TYPE_RATE_CHANGE: - p += scnprintf(pb + p, length - p, "rate_change %d %d", - ev->data.index, ev->data.rate); - break; - case RC_PID_EVENT_TYPE_TX_RATE: - p += scnprintf(pb + p, length - p, "tx_rate %d %d", - ev->data.index, ev->data.rate); - break; - case RC_PID_EVENT_TYPE_PF_SAMPLE: - p += scnprintf(pb + p, length - p, - "pf_sample %d %d %d %d", - ev->data.pf_sample, ev->data.prop_err, - ev->data.int_err, ev->data.der_err); - break; - } - p += scnprintf(pb + p, length - p, "\n"); - - spin_unlock_irqrestore(&events->lock, status); - - if (copy_to_user(buf, pb, p)) - return -EFAULT; - - return p; -} - -#undef RC_PID_PRINT_BUF_SIZE - -static const struct file_operations rc_pid_fop_events = { - .owner = THIS_MODULE, - .read = rate_control_pid_events_read, - .poll = rate_control_pid_events_poll, - .open = rate_control_pid_events_open, - .release = rate_control_pid_events_release, - .llseek = noop_llseek, -}; - -void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, - struct dentry *dir) -{ - struct rc_pid_sta_info *spinfo = priv_sta; - - spinfo->events_entry = debugfs_create_file("rc_pid_events", S_IRUGO, - dir, spinfo, - &rc_pid_fop_events); -} - -void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta) -{ - struct rc_pid_sta_info *spinfo = priv_sta; - - 
debugfs_remove(spinfo->events_entry); -} diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 394e201cde6d..b04ca4049c95 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -688,20 +689,27 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, int index, struct sk_buff_head *frames) { - struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; + struct sk_buff_head *skb_list = &tid_agg_rx->reorder_buf[index]; + struct sk_buff *skb; struct ieee80211_rx_status *status; lockdep_assert_held(&tid_agg_rx->reorder_lock); - if (!skb) + if (skb_queue_empty(skb_list)) + goto no_frame; + + if (!ieee80211_rx_reorder_ready(skb_list)) { + __skb_queue_purge(skb_list); goto no_frame; + } - /* release the frame from the reorder ring buffer */ + /* release frames from the reorder ring buffer */ tid_agg_rx->stored_mpdu_num--; - tid_agg_rx->reorder_buf[index] = NULL; - status = IEEE80211_SKB_RXCB(skb); - status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE; - __skb_queue_tail(frames, skb); + while ((skb = __skb_dequeue(skb_list))) { + status = IEEE80211_SKB_RXCB(skb); + status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE; + __skb_queue_tail(frames, skb); + } no_frame: tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); @@ -738,13 +746,13 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, struct sk_buff_head *frames) { - int index, j; + int index, i, j; lockdep_assert_held(&tid_agg_rx->reorder_lock); /* release the buffer until next missing frame */ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; 
- if (!tid_agg_rx->reorder_buf[index] && + if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) && tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any @@ -753,7 +761,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, int skipped = 1; for (j = (index + 1) % tid_agg_rx->buf_size; j != index; j = (j + 1) % tid_agg_rx->buf_size) { - if (!tid_agg_rx->reorder_buf[j]) { + if (!ieee80211_rx_reorder_ready( + &tid_agg_rx->reorder_buf[j])) { skipped++; continue; } @@ -762,6 +771,11 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, HT_RX_REORDER_BUF_TIMEOUT)) goto set_release_timer; + /* don't leave incomplete A-MSDUs around */ + for (i = (index + 1) % tid_agg_rx->buf_size; i != j; + i = (i + 1) % tid_agg_rx->buf_size) + __skb_queue_purge(&tid_agg_rx->reorder_buf[i]); + ht_dbg_ratelimited(sdata, "release an RX reorder frame due to timeout on earlier frames\n"); ieee80211_release_reorder_frame(sdata, tid_agg_rx, j, @@ -775,7 +789,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, skipped) & IEEE80211_SN_MASK; skipped = 0; } - } else while (tid_agg_rx->reorder_buf[index]) { + } else while (ieee80211_rx_reorder_ready( + &tid_agg_rx->reorder_buf[index])) { ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, frames); index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size; @@ -786,7 +801,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, for (; j != (index - 1) % tid_agg_rx->buf_size; j = (j + 1) % tid_agg_rx->buf_size) { - if (tid_agg_rx->reorder_buf[j]) + if (ieee80211_rx_reorder_ready( + &tid_agg_rx->reorder_buf[j])) break; } @@ -811,6 +827,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata struct sk_buff_head *frames) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u16 sc = 
le16_to_cpu(hdr->seq_ctrl); u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; u16 head_seq_num, buf_size; @@ -819,6 +836,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata spin_lock(&tid_agg_rx->reorder_lock); + /* + * Offloaded BA sessions have no known starting sequence number so pick + * one from first Rxed frame for this tid after BA was started. + */ + if (unlikely(tid_agg_rx->auto_seq)) { + tid_agg_rx->auto_seq = false; + tid_agg_rx->ssn = mpdu_seq_num; + tid_agg_rx->head_seq_num = mpdu_seq_num; + } + buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; @@ -845,7 +872,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata index = mpdu_seq_num % tid_agg_rx->buf_size; /* check if we already stored this frame */ - if (tid_agg_rx->reorder_buf[index]) { + if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) { dev_kfree_skb(skb); goto out; } @@ -858,17 +885,20 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata */ if (mpdu_seq_num == tid_agg_rx->head_seq_num && tid_agg_rx->stored_mpdu_num == 0) { - tid_agg_rx->head_seq_num = - ieee80211_sn_inc(tid_agg_rx->head_seq_num); + if (!(status->flag & RX_FLAG_AMSDU_MORE)) + tid_agg_rx->head_seq_num = + ieee80211_sn_inc(tid_agg_rx->head_seq_num); ret = false; goto out; } /* put the frame in the reordering buffer */ - tid_agg_rx->reorder_buf[index] = skb; - tid_agg_rx->reorder_time[index] = jiffies; - tid_agg_rx->stored_mpdu_num++; - ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames); + __skb_queue_tail(&tid_agg_rx->reorder_buf[index], skb); + if (!(status->flag & RX_FLAG_AMSDU_MORE)) { + tid_agg_rx->reorder_time[index] = jiffies; + tid_agg_rx->stored_mpdu_num++; + ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames); + } out: spin_unlock(&tid_agg_rx->reorder_lock); @@ -1107,6 +1137,8 @@ static void sta_ps_end(struct sta_info *sta) return; } + set_sta_flag(sta, WLAN_STA_PS_DELIVER); + 
clear_sta_flag(sta, WLAN_STA_PS_STA); ieee80211_sta_ps_deliver_wakeup(sta); } @@ -2704,7 +2736,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) { + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_packets++; dev_kfree_skb(rx->skb); @@ -3127,6 +3159,14 @@ static bool prepare_for_handlers(struct ieee80211_rx_data *rx, if (!ieee80211_is_beacon(hdr->frame_control)) return false; status->rx_flags &= ~IEEE80211_RX_RA_MATCH; + } else if (!ieee80211_has_tods(hdr->frame_control)) { + /* ignore data frames to TDLS-peers */ + if (ieee80211_is_data(hdr->frame_control)) + return false; + /* ignore action frames to TDLS-peers */ + if (ieee80211_is_action(hdr->frame_control) && + !ether_addr_equal(bssid, hdr->addr1)) + return false; } break; case NL80211_IFTYPE_WDS: diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f40661eb75b5..af0d094b2f2f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -6,6 +6,7 @@ * Copyright 2005, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -235,38 +236,51 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) { struct cfg80211_scan_request *req = local->scan_req; struct cfg80211_chan_def chandef; - enum ieee80211_band band; + u8 bands_used = 0; int i, ielen, n_chans; if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; - do { - if (local->hw_scan_band == IEEE80211_NUM_BANDS) - return false; - - band = local->hw_scan_band; - n_chans = 0; + if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { for (i = 0; i < req->n_channels; i++) { - if (req->channels[i]->band == band) { - local->hw_scan_req->channels[n_chans] = + local->hw_scan_req->req.channels[i] = req->channels[i]; + bands_used |= BIT(req->channels[i]->band); + } + + n_chans = req->n_channels; + } else { + do { + if (local->hw_scan_band == IEEE80211_NUM_BANDS) + return false; + + n_chans = 0; + + for (i = 0; i < req->n_channels; i++) { + if (req->channels[i]->band != + local->hw_scan_band) + continue; + local->hw_scan_req->req.channels[n_chans] = req->channels[i]; n_chans++; + bands_used |= BIT(req->channels[i]->band); } - } - local->hw_scan_band++; - } while (!n_chans); + local->hw_scan_band++; + } while (!n_chans); + } - local->hw_scan_req->n_channels = n_chans; + local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie, + ielen = ieee80211_build_preq_ies(local, + (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, - req->ie, req->ie_len, band, - req->rates[band], &chandef); - local->hw_scan_req->ie_len = ielen; - local->hw_scan_req->no_cck = req->no_cck; + &local->hw_scan_req->ies, + 
req->ie, req->ie_len, + bands_used, req->rates, &chandef); + local->hw_scan_req->req.ie_len = ielen; + local->hw_scan_req->req.no_cck = req->no_cck; return true; } @@ -291,7 +305,9 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scan_req)) return; - if (hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { + if (hw_scan && !aborted && + !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) && + ieee80211_prep_hw_scan(local)) { int rc; rc = drv_hw_scan(local, @@ -473,6 +489,21 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; + + if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) { + int i, n_bands = 0; + u8 bands_counted = 0; + + for (i = 0; i < req->n_channels; i++) { + if (bands_counted & BIT(req->channels[i]->band)) + continue; + bands_counted |= BIT(req->channels[i]->band); + n_bands++; + } + + local->hw_scan_ies_bufsize *= n_bands; + } + local->hw_scan_req = kmalloc( sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]) + @@ -480,13 +511,13 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (!local->hw_scan_req) return -ENOMEM; - local->hw_scan_req->ssids = req->ssids; - local->hw_scan_req->n_ssids = req->n_ssids; + local->hw_scan_req->req.ssids = req->ssids; + local->hw_scan_req->req.n_ssids = req->n_ssids; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); - local->hw_scan_req->ie = ies; - local->hw_scan_req->flags = req->flags; + local->hw_scan_req->req.ie = ies; + local->hw_scan_req->req.flags = req->flags; local->hw_scan_band = 0; @@ -973,9 +1004,13 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; - struct ieee80211_sched_scan_ies sched_scan_ies = {}; + struct 
ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; - int ret, i, iebufsz; + int ret, i, iebufsz, num_bands = 0; + u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + u8 bands_used = 0; + u8 *ie; + size_t len; iebufsz = local->scan_ies_len + req->ie_len; @@ -985,33 +1020,35 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, return -ENOTSUPP; for (i = 0; i < IEEE80211_NUM_BANDS; i++) { - if (!local->hw.wiphy->bands[i]) - continue; - - sched_scan_ies.ie[i] = kzalloc(iebufsz, GFP_KERNEL); - if (!sched_scan_ies.ie[i]) { - ret = -ENOMEM; - goto out_free; + if (local->hw.wiphy->bands[i]) { + bands_used |= BIT(i); + rate_masks[i] = (u32) -1; + num_bands++; } + } - ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - - sched_scan_ies.len[i] = - ieee80211_build_preq_ies(local, sched_scan_ies.ie[i], - iebufsz, req->ie, req->ie_len, - i, (u32) -1, &chandef); + ie = kzalloc(num_bands * iebufsz, GFP_KERNEL); + if (!ie) { + ret = -ENOMEM; + goto out; } + ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + + len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + &sched_scan_ies, req->ie, + req->ie_len, bands_used, + rate_masks, &chandef); + ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); local->sched_scan_req = req; } -out_free: - while (i > 0) - kfree(sched_scan_ies.ie[--i]); + kfree(ie); +out: if (ret) { /* Clean in case of failure after HW restart or upon resume. 
*/ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); @@ -1058,7 +1095,7 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata) if (rcu_access_pointer(local->sched_scan_sdata)) { ret = drv_sched_scan_stop(local, sdata); if (!ret) - rcu_assign_pointer(local->sched_scan_sdata, NULL); + RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } out: mutex_unlock(&local->mtx); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a9b46d8ea22f..de494df3bab8 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1,6 +1,7 @@ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -100,7 +101,8 @@ static void __cleanup_single_sta(struct sta_info *sta) struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || - test_sta_flag(sta, WLAN_STA_PS_DRIVER)) { + test_sta_flag(sta, WLAN_STA_PS_DRIVER) || + test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; @@ -111,6 +113,7 @@ static void __cleanup_single_sta(struct sta_info *sta) clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); sta_info_recalc_tim(sta); @@ -125,7 +128,7 @@ static void __cleanup_single_sta(struct sta_info *sta) if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); - cancel_work_sync(&sta->drv_unblock_wk); + cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. 
It would be nice to wait for the @@ -253,33 +256,23 @@ static void sta_info_hash_add(struct ieee80211_local *local, rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta); } -static void sta_unblock(struct work_struct *wk) +static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; - sta = container_of(wk, struct sta_info, drv_unblock_wk); + sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; - if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { - local_bh_disable(); + local_bh_disable(); + if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); - local_bh_enable(); - } else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) { - clear_sta_flag(sta, WLAN_STA_PS_DRIVER); - - local_bh_disable(); + else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); - local_bh_enable(); - } else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) { - clear_sta_flag(sta, WLAN_STA_PS_DRIVER); - - local_bh_disable(); + else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); - local_bh_enable(); - } else - clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, @@ -341,7 +334,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); - INIT_WORK(&sta->drv_unblock_wk, sta_unblock); + INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH @@ -358,7 +351,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sta_state = IEEE80211_STA_NONE; - do_posix_clock_monotonic_gettime(&uptime); + ktime_get_ts(&uptime); sta->last_connected = uptime.tv_sec; ewma_init(&sta->avg_signal, 1024, 8); for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) @@ 
-1102,8 +1095,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) unsigned long flags; struct ps_data *ps; - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, + u.ap); + + if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; @@ -1141,8 +1137,15 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) } ieee80211_add_pending_skbs(local, &pending); - clear_sta_flag(sta, WLAN_STA_PS_DRIVER); - clear_sta_flag(sta, WLAN_STA_PS_STA); + + /* now we're no longer in the deliver code */ + clear_sta_flag(sta, WLAN_STA_PS_DELIVER); + + /* The station might have polled and then woken up before we responded, + * so clear these flags now to avoid them sticking around. + */ + clear_sta_flag(sta, WLAN_STA_PSPOLL); + clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); @@ -1180,7 +1183,7 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; - bool qos = test_sta_flag(sta, WLAN_STA_WME); + bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; @@ -1543,10 +1546,26 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, trace_api_sta_block_awake(sta->local, pubsta, block); - if (block) + if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); - else if (test_sta_flag(sta, WLAN_STA_PS_DRIVER)) - ieee80211_queue_work(hw, &sta->drv_unblock_wk); + return; + } + + if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) + return; + + if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { + set_sta_flag(sta, WLAN_STA_PS_DELIVER); + clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_queue_work(hw, &sta->drv_deliver_wk); + } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || + test_sta_flag(sta, 
WLAN_STA_UAPSD)) { + /* must be asleep in this case */ + clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + ieee80211_queue_work(hw, &sta->drv_deliver_wk); + } else { + clear_sta_flag(sta, WLAN_STA_PS_DRIVER); + } } EXPORT_SYMBOL(ieee80211_sta_block_awake); @@ -1704,3 +1723,140 @@ u8 sta_info_tx_streams(struct sta_info *sta) return ((ht_cap->mcs.tx_params & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; } + +void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct rate_control_ref *ref = NULL; + struct timespec uptime; + u64 packets = 0; + u32 thr = 0; + int i, ac; + + if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) + ref = local->rate_ctrl; + + sinfo->generation = sdata->local->sta_generation; + + sinfo->filled = STATION_INFO_INACTIVE_TIME | + STATION_INFO_RX_BYTES64 | + STATION_INFO_TX_BYTES64 | + STATION_INFO_RX_PACKETS | + STATION_INFO_TX_PACKETS | + STATION_INFO_TX_RETRIES | + STATION_INFO_TX_FAILED | + STATION_INFO_TX_BITRATE | + STATION_INFO_RX_BITRATE | + STATION_INFO_RX_DROP_MISC | + STATION_INFO_BSS_PARAM | + STATION_INFO_CONNECTED_TIME | + STATION_INFO_STA_FLAGS | + STATION_INFO_BEACON_LOSS_COUNT; + + ktime_get_ts(&uptime); + sinfo->connected_time = uptime.tv_sec - sta->last_connected; + + sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); + sinfo->tx_bytes = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + sinfo->tx_bytes += sta->tx_bytes[ac]; + packets += sta->tx_packets[ac]; + } + sinfo->tx_packets = packets; + sinfo->rx_bytes = sta->rx_bytes; + sinfo->rx_packets = sta->rx_packets; + sinfo->tx_retries = sta->tx_retry_count; + sinfo->tx_failed = sta->tx_retry_failed; + sinfo->rx_dropped_misc = sta->rx_dropped; + sinfo->beacon_loss_count = sta->beacon_loss_count; + + if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || + (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { + 
sinfo->filled |= STATION_INFO_SIGNAL | STATION_INFO_SIGNAL_AVG; + if (!local->ops->get_rssi || + drv_get_rssi(local, sdata, &sta->sta, &sinfo->signal)) + sinfo->signal = (s8)sta->last_signal; + sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal); + } + if (sta->chains) { + sinfo->filled |= STATION_INFO_CHAIN_SIGNAL | + STATION_INFO_CHAIN_SIGNAL_AVG; + + sinfo->chains = sta->chains; + for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { + sinfo->chain_signal[i] = sta->chain_signal_last[i]; + sinfo->chain_signal_avg[i] = + (s8) -ewma_read(&sta->chain_signal_avg[i]); + } + } + + sta_set_rate_info_tx(sta, &sta->last_tx_rate, &sinfo->txrate); + sta_set_rate_info_rx(sta, &sinfo->rxrate); + + if (ieee80211_vif_is_mesh(&sdata->vif)) { +#ifdef CONFIG_MAC80211_MESH + sinfo->filled |= STATION_INFO_LLID | + STATION_INFO_PLID | + STATION_INFO_PLINK_STATE | + STATION_INFO_LOCAL_PM | + STATION_INFO_PEER_PM | + STATION_INFO_NONPEER_PM; + + sinfo->llid = sta->llid; + sinfo->plid = sta->plid; + sinfo->plink_state = sta->plink_state; + if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { + sinfo->filled |= STATION_INFO_T_OFFSET; + sinfo->t_offset = sta->t_offset; + } + sinfo->local_pm = sta->local_pm; + sinfo->peer_pm = sta->peer_pm; + sinfo->nonpeer_pm = sta->nonpeer_pm; +#endif + } + + sinfo->bss_param.flags = 0; + if (sdata->vif.bss_conf.use_cts_prot) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; + if (sdata->vif.bss_conf.use_short_preamble) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; + if (sdata->vif.bss_conf.use_short_slot) + sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; + sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; + sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; + + sinfo->sta_flags.set = 0; + sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | + BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | + BIT(NL80211_STA_FLAG_WME) | + BIT(NL80211_STA_FLAG_MFP) | + BIT(NL80211_STA_FLAG_AUTHENTICATED) | + 
BIT(NL80211_STA_FLAG_ASSOCIATED) | + BIT(NL80211_STA_FLAG_TDLS_PEER); + if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); + if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); + if (sta->sta.wme) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); + if (test_sta_flag(sta, WLAN_STA_MFP)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); + if (test_sta_flag(sta, WLAN_STA_AUTH)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); + if (test_sta_flag(sta, WLAN_STA_ASSOC)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); + + /* check if the driver has a SW RC implementation */ + if (ref && ref->ops->get_expected_throughput) + thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); + else + thr = drv_get_expected_throughput(local, &sta->sta); + + if (thr != 0) { + sinfo->filled |= STATION_INFO_EXPECTED_THROUGHPUT; + sinfo->expected_throughput = thr; + } +} diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 4acc5fc402fa..42f68cb8957e 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,5 +1,6 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +32,6 @@ * when virtual port control is not in use. * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble * frames. - * @WLAN_STA_WME: Station is a QoS-STA. * @WLAN_STA_WDS: Station is one of our WDS peers. * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next @@ -47,6 +47,8 @@ * @WLAN_STA_TDLS_PEER: Station is a TDLS peer. 
* @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct * packets. This means the link is enabled. + * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this + * station. * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was * keeping station in power-save mode, reply when the driver * unblocks the station. @@ -58,6 +60,8 @@ * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid. * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period. * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. + * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX + * until pending frames are delivered */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, @@ -65,7 +69,6 @@ enum ieee80211_sta_info_flags { WLAN_STA_PS_STA, WLAN_STA_AUTHORIZED, WLAN_STA_SHORT_PREAMBLE, - WLAN_STA_WME, WLAN_STA_WDS, WLAN_STA_CLEAR_PS_FILT, WLAN_STA_MFP, @@ -74,6 +77,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_PSPOLL, WLAN_STA_TDLS_PEER, WLAN_STA_TDLS_PEER_AUTH, + WLAN_STA_TDLS_INITIATOR, WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, @@ -82,6 +86,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_TOFFSET_KNOWN, WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, + WLAN_STA_PS_DELIVER, }; #define ADDBA_RESP_INTERVAL HZ @@ -149,7 +154,8 @@ struct tid_ampdu_tx { /** * struct tid_ampdu_rx - TID aggregation information (Rx). * - * @reorder_buf: buffer to reorder incoming aggregated MPDUs + * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an + * A-MSDU with individually reported subframes. * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) * @reorder_timer: releases expired frames from the reorder buffer. @@ -162,6 +168,8 @@ struct tid_ampdu_tx { * @dialog_token: dialog token for aggregation session * @rcu_head: RCU head used for freeing this struct * @reorder_lock: serializes access to reorder buffer, see below. 
+ * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and + * and ssn. * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -174,7 +182,7 @@ struct tid_ampdu_tx { struct tid_ampdu_rx { struct rcu_head rcu_head; spinlock_t reorder_lock; - struct sk_buff **reorder_buf; + struct sk_buff_head *reorder_buf; unsigned long *reorder_time; struct timer_list session_timer; struct timer_list reorder_timer; @@ -185,6 +193,7 @@ struct tid_ampdu_rx { u16 buf_size; u16 timeout; u8 dialog_token; + bool auto_seq; }; /** @@ -265,7 +274,7 @@ struct ieee80211_tx_latency_stat { * @last_rx_rate_vht_nss: rx status nss of last data packet * @lock: used for locking all fields that require locking, see comments * in the header file. - * @drv_unblock_wk: used for driver PS unblocking + * @drv_deliver_wk: used for delivering frames after driver PS unblocking * @listen_interval: listen interval of this station, when we're acting as AP * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly * @ps_lock: used for powersave (when mac80211 is the AP) related locking @@ -278,7 +287,6 @@ struct ieee80211_tx_latency_stat { * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on * @rx_packets: Number of MSDUs received from this STA * @rx_bytes: Number of bytes received from this STA - * @wep_weak_iv_count: number of weak WEP IVs received from this station * @last_rx: time (in jiffies) when last frame was received from this STA * @last_connected: time (in seconds) when a station got connected * @num_duplicates: number of duplicate frames received from this STA @@ -303,7 +311,6 @@ struct ieee80211_tx_latency_stat { * @plid: Peer link ID * @reason: Cancel reason on PLINK_HOLDING state * @plink_retries: Retries in establishment - * @ignore_plink_timer: ignore the peer-link timer (used internally) * @plink_state: peer link state * @plink_timeout: timeout of peer link * 
@plink_timer: peer link watch timer @@ -345,7 +352,7 @@ struct sta_info { void *rate_ctrl_priv; spinlock_t lock; - struct work_struct drv_unblock_wk; + struct work_struct drv_deliver_wk; u16 listen_interval; @@ -367,7 +374,6 @@ struct sta_info { /* Updated from RX path only, no locking requirements */ unsigned long rx_packets; u64 rx_bytes; - unsigned long wep_weak_iv_count; unsigned long last_rx; long last_connected; unsigned long num_duplicates; @@ -418,7 +424,6 @@ struct sta_info { u16 plid; u16 reason; u8 plink_retries; - bool ignore_plink_timer; enum nl80211_plink_state plink_state; u32 plink_timeout; struct timer_list plink_timer; @@ -445,6 +450,9 @@ struct sta_info { enum ieee80211_smps_mode known_smps_mode; const struct ieee80211_cipher_scheme *cipher_scheme; + /* TDLS timeout data */ + unsigned long last_tdls_pkt_time; + /* keep last! */ struct ieee80211_sta sta; }; @@ -628,6 +636,8 @@ void sta_set_rate_info_tx(struct sta_info *sta, struct rate_info *rinfo); void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo); +void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); + void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time); u8 sta_info_tx_streams(struct sta_info *sta); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ba29ebc86141..89290e33dafe 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -473,8 +474,6 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_hdr *hdr) { - ktime_t skb_dprt; - struct timespec dprt_time; u32 msrmnt; u16 tid; u8 *qc; @@ -506,9 +505,8 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, tx_lat = &sta->tx_lat[tid]; - ktime_get_ts(&dprt_time); /* time stamp completion time */ - skb_dprt = ktime_set(dprt_time.tv_sec, dprt_time.tv_nsec); - msrmnt = ktime_to_ms(ktime_sub(skb_dprt, skb_arv)); + /* Calculate the latency */ + msrmnt = ktime_to_ms(ktime_sub(ktime_get(), skb_arv)); if (tx_lat->max < msrmnt) /* update stats */ tx_lat->max = msrmnt; @@ -540,6 +538,8 @@ static void ieee80211_tx_latency_end_msrmnt(struct ieee80211_local *local, * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 +#define STA_LOST_TDLS_PKT_THRESHOLD 10 +#define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct sk_buff *skb) { @@ -550,7 +550,20 @@ static void ieee80211_lost_packet(struct sta_info *sta, struct sk_buff *skb) !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; - if (++sta->lost_packets < STA_LOST_PKT_THRESHOLD) + sta->lost_packets++; + if (!sta->sta.tdls && sta->lost_packets < STA_LOST_PKT_THRESHOLD) + return; + + /* + * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD + * of the last packets were lost, and that no ACK was received in the + * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss + * mechanism. 
+ */ + if (sta->sta.tdls && + (sta->lost_packets < STA_LOST_TDLS_PKT_THRESHOLD || + time_before(jiffies, + sta->last_tdls_pkt_time + STA_LOST_TDLS_PKT_TIME))) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, @@ -697,6 +710,10 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (info->flags & IEEE80211_TX_STAT_ACK) { if (sta->lost_packets) sta->lost_packets = 0; + + /* Track when last TDLS packet was ACKed */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) + sta->last_tdls_pkt_time = jiffies; } else { ieee80211_lost_packet(sta, skb); } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 652813b2d3df..4ea25dec0698 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -3,12 +3,37 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014, Intel Corporation + * Copyright 2014 Intel Mobile Communications GmbH * * This file is GPLv2 as found in COPYING. */ #include <linux/ieee80211.h> +#include <linux/log2.h> +#include <net/cfg80211.h> #include "ieee80211_i.h" +#include "driver-ops.h" + +/* give usermode some time for retries in setting up the TDLS session */ +#define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) + +void ieee80211_tdls_peer_del_work(struct work_struct *wk) +{ + struct ieee80211_sub_if_data *sdata; + struct ieee80211_local *local; + + sdata = container_of(wk, struct ieee80211_sub_if_data, + u.mgd.tdls_peer_del_work.work); + local = sdata->local; + + mutex_lock(&local->mtx); + if (!is_zero_ether_addr(sdata->u.mgd.tdls_peer)) { + tdls_dbg(sdata, "TDLS del peer %pM\n", sdata->u.mgd.tdls_peer); + sta_info_destroy_addr(sdata, sdata->u.mgd.tdls_peer); + eth_zero_addr(sdata->u.mgd.tdls_peer); + } + mutex_unlock(&local->mtx); +} static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb) { @@ -23,11 +48,16 @@ static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb) *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; } -static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data 
*sdata) +static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, + u16 status_code) { struct ieee80211_local *local = sdata->local; u16 capab; + /* The capability will be 0 when sending a failure code */ + if (status_code != 0) + return 0; + capab = 0; if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ) return capab; @@ -40,19 +70,331 @@ static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata) return capab; } -static void ieee80211_tdls_add_link_ie(struct sk_buff *skb, const u8 *src_addr, - const u8 *peer, const u8 *bssid) +static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, const u8 *peer, + bool initiator) { struct ieee80211_tdls_lnkie *lnkid; + const u8 *init_addr, *rsp_addr; + + if (initiator) { + init_addr = sdata->vif.addr; + rsp_addr = peer; + } else { + init_addr = peer; + rsp_addr = sdata->vif.addr; + } lnkid = (void *)skb_put(skb, sizeof(struct ieee80211_tdls_lnkie)); lnkid->ie_type = WLAN_EID_LINK_ID; lnkid->ie_len = sizeof(struct ieee80211_tdls_lnkie) - 2; - memcpy(lnkid->bssid, bssid, ETH_ALEN); - memcpy(lnkid->init_sta, src_addr, ETH_ALEN); - memcpy(lnkid->resp_sta, peer, ETH_ALEN); + memcpy(lnkid->bssid, sdata->u.mgd.bssid, ETH_ALEN); + memcpy(lnkid->init_sta, init_addr, ETH_ALEN); + memcpy(lnkid->resp_sta, rsp_addr, ETH_ALEN); +} + +/* translate numbering in the WMM parameter IE to the mac80211 notation */ +static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac) +{ + switch (ac) { + default: + WARN_ON_ONCE(1); + case 0: + return IEEE80211_AC_BE; + case 1: + return IEEE80211_AC_BK; + case 2: + return IEEE80211_AC_VI; + case 3: + return IEEE80211_AC_VO; + } +} + +static u8 ieee80211_wmm_aci_aifsn(int aifsn, bool acm, int aci) +{ + u8 ret; + + ret = aifsn & 0x0f; + if (acm) + ret |= 0x10; + ret |= (aci << 5) & 0x60; + return ret; +} + +static u8 ieee80211_wmm_ecw(u16 cw_min, u16 cw_max) +{ + return ((ilog2(cw_min + 1) << 0x0) & 0x0f) | + 
((ilog2(cw_max + 1) << 0x4) & 0xf0); +} + +static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_wmm_param_ie *wmm; + struct ieee80211_tx_queue_params *txq; + int i; + + wmm = (void *)skb_put(skb, sizeof(*wmm)); + memset(wmm, 0, sizeof(*wmm)); + + wmm->element_id = WLAN_EID_VENDOR_SPECIFIC; + wmm->len = sizeof(*wmm) - 2; + + wmm->oui[0] = 0x00; /* Microsoft OUI 00:50:F2 */ + wmm->oui[1] = 0x50; + wmm->oui[2] = 0xf2; + wmm->oui_type = 2; /* WME */ + wmm->oui_subtype = 1; /* WME param */ + wmm->version = 1; /* WME ver */ + wmm->qos_info = 0; /* U-APSD not in use */ + + /* + * Use the EDCA parameters defined for the BSS, or default if the AP + * doesn't support it, as mandated by 802.11-2012 section 10.22.4 + */ + for (i = 0; i < IEEE80211_NUM_ACS; i++) { + txq = &sdata->tx_conf[ieee80211_ac_from_wmm(i)]; + wmm->ac[i].aci_aifsn = ieee80211_wmm_aci_aifsn(txq->aifs, + txq->acm, i); + wmm->ac[i].cw = ieee80211_wmm_ecw(txq->cw_min, txq->cw_max); + wmm->ac[i].txop_limit = cpu_to_le16(txq->txop); + } +} + +static void +ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, const u8 *peer, + u8 action_code, bool initiator, + const u8 *extra_ies, size_t extra_ies_len) +{ + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + struct ieee80211_sta_ht_cap ht_cap; + struct sta_info *sta = NULL; + size_t offset = 0, noffset; + u8 *pos; + + rcu_read_lock(); + + /* we should have the peer STA if we're already responding */ + if (action_code == WLAN_TDLS_SETUP_RESPONSE) { + sta = sta_info_get(sdata, peer); + if (WARN_ON_ONCE(!sta)) { + rcu_read_unlock(); + return; + } + } + + ieee80211_add_srates_ie(sdata, skb, false, band); + ieee80211_add_ext_srates_ie(sdata, skb, false, band); + + /* add any custom IEs that go before Extended Capabilities */ + if (extra_ies_len) { + static 
const u8 before_ext_cap[] = { + WLAN_EID_SUPP_RATES, + WLAN_EID_COUNTRY, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + }; + noffset = ieee80211_ie_split(extra_ies, extra_ies_len, + before_ext_cap, + ARRAY_SIZE(before_ext_cap), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + offset = noffset; + } + + ieee80211_tdls_add_ext_capab(skb); + + /* add the QoS element if we support it */ + if (local->hw.queues >= IEEE80211_NUM_ACS && + action_code != WLAN_PUB_ACTION_TDLS_DISCOVER_RES) + ieee80211_add_wmm_info_ie(skb_put(skb, 9), 0); /* no U-APSD */ + + /* add any custom IEs that go before HT capabilities */ + if (extra_ies_len) { + static const u8 before_ht_cap[] = { + WLAN_EID_SUPP_RATES, + WLAN_EID_COUNTRY, + WLAN_EID_EXT_SUPP_RATES, + WLAN_EID_SUPPORTED_CHANNELS, + WLAN_EID_RSN, + WLAN_EID_EXT_CAPABILITY, + WLAN_EID_QOS_CAPA, + WLAN_EID_FAST_BSS_TRANSITION, + WLAN_EID_TIMEOUT_INTERVAL, + WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + }; + noffset = ieee80211_ie_split(extra_ies, extra_ies_len, + before_ht_cap, + ARRAY_SIZE(before_ht_cap), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + offset = noffset; + } + + /* + * with TDLS we can switch channels, and HT-caps are not necessarily + * the same on all bands. The specification limits the setup to a + * single HT-cap, so use the current band for now. 
+ */ + sband = local->hw.wiphy->bands[band]; + memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); + if ((action_code == WLAN_TDLS_SETUP_REQUEST || + action_code == WLAN_TDLS_SETUP_RESPONSE) && + ht_cap.ht_supported && (!sta || sta->sta.ht_cap.ht_supported)) { + if (action_code == WLAN_TDLS_SETUP_REQUEST) { + ieee80211_apply_htcap_overrides(sdata, &ht_cap); + + /* disable SMPS in TDLS initiator */ + ht_cap.cap |= (WLAN_HT_CAP_SM_PS_DISABLED + << IEEE80211_HT_CAP_SM_PS_SHIFT); + } else { + /* disable SMPS in TDLS responder */ + sta->sta.ht_cap.cap |= + (WLAN_HT_CAP_SM_PS_DISABLED + << IEEE80211_HT_CAP_SM_PS_SHIFT); + + /* the peer caps are already intersected with our own */ + memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap)); + } + + pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); + ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); + } + + rcu_read_unlock(); + + /* add any remaining IEs */ + if (extra_ies_len) { + noffset = extra_ies_len; + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + } + + ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); +} + +static void +ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, const u8 *peer, + bool initiator, const u8 *extra_ies, + size_t extra_ies_len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + size_t offset = 0, noffset; + struct sta_info *sta, *ap_sta; + u8 *pos; + + rcu_read_lock(); + + sta = sta_info_get(sdata, peer); + ap_sta = sta_info_get(sdata, ifmgd->bssid); + if (WARN_ON_ONCE(!sta || !ap_sta)) { + rcu_read_unlock(); + return; + } + + /* add any custom IEs that go before the QoS IE */ + if (extra_ies_len) { + static const u8 before_qos[] = { + WLAN_EID_RSN, + }; + noffset = ieee80211_ie_split(extra_ies, extra_ies_len, + before_qos, + ARRAY_SIZE(before_qos), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - 
offset); + offset = noffset; + } + + /* add the QoS param IE if both the peer and we support it */ + if (local->hw.queues >= IEEE80211_NUM_ACS && sta->sta.wme) + ieee80211_tdls_add_wmm_param_ie(sdata, skb); + + /* add any custom IEs that go before HT operation */ + if (extra_ies_len) { + static const u8 before_ht_op[] = { + WLAN_EID_RSN, + WLAN_EID_QOS_CAPA, + WLAN_EID_FAST_BSS_TRANSITION, + WLAN_EID_TIMEOUT_INTERVAL, + }; + noffset = ieee80211_ie_split(extra_ies, extra_ies_len, + before_ht_op, + ARRAY_SIZE(before_ht_op), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + offset = noffset; + } + + /* if HT support is only added in TDLS, we need an HT-operation IE */ + if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { + struct ieee80211_chanctx_conf *chanctx_conf = + rcu_dereference(sdata->vif.chanctx_conf); + if (!WARN_ON(!chanctx_conf)) { + pos = skb_put(skb, 2 + + sizeof(struct ieee80211_ht_operation)); + /* send an empty HT operation IE */ + ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap, + &chanctx_conf->def, 0); + } + } + + rcu_read_unlock(); + + /* add any remaining IEs */ + if (extra_ies_len) { + noffset = extra_ies_len; + pos = skb_put(skb, noffset - offset); + memcpy(pos, extra_ies + offset, noffset - offset); + } + + ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); +} + +static void ieee80211_tdls_add_ies(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, const u8 *peer, + u8 action_code, u16 status_code, + bool initiator, const u8 *extra_ies, + size_t extra_ies_len) +{ + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + case WLAN_TDLS_SETUP_RESPONSE: + case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: + if (status_code == 0) + ieee80211_tdls_add_setup_start_ies(sdata, skb, peer, + action_code, + initiator, + extra_ies, + extra_ies_len); + break; + case WLAN_TDLS_SETUP_CONFIRM: + if (status_code == 0) + ieee80211_tdls_add_setup_cfm_ies(sdata, skb, peer, + 
initiator, extra_ies, + extra_ies_len); + break; + case WLAN_TDLS_TEARDOWN: + case WLAN_TDLS_DISCOVERY_REQUEST: + if (extra_ies_len) + memcpy(skb_put(skb, extra_ies_len), extra_ies, + extra_ies_len); + if (status_code == 0 || action_code == WLAN_TDLS_TEARDOWN) + ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); + break; + } + } static int @@ -61,7 +403,6 @@ ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, u16 status_code, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_tdls_data *tf; tf = (void *)skb_put(skb, offsetof(struct ieee80211_tdls_data, u)); @@ -71,6 +412,9 @@ ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, tf->ether_type = cpu_to_be16(ETH_P_TDLS); tf->payload_type = WLAN_TDLS_SNAP_RFTYPE; + /* network header is after the ethernet header */ + skb_set_network_header(skb, ETH_HLEN); + switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: tf->category = WLAN_CATEGORY_TDLS; @@ -79,11 +423,8 @@ ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, skb_put(skb, sizeof(tf->u.setup_req)); tf->u.setup_req.dialog_token = dialog_token; tf->u.setup_req.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); - ieee80211_tdls_add_ext_capab(skb); + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata, + status_code)); break; case WLAN_TDLS_SETUP_RESPONSE: tf->category = WLAN_CATEGORY_TDLS; @@ -93,11 +434,8 @@ ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev, tf->u.setup_resp.status_code = cpu_to_le16(status_code); tf->u.setup_resp.dialog_token = dialog_token; tf->u.setup_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, 
false, band); - ieee80211_tdls_add_ext_capab(skb); + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata, + status_code)); break; case WLAN_TDLS_SETUP_CONFIRM: tf->category = WLAN_CATEGORY_TDLS; @@ -134,7 +472,6 @@ ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, u16 status_code, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_mgmt *mgmt; mgmt = (void *)skb_put(skb, 24); @@ -155,11 +492,8 @@ ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, mgmt->u.action.u.tdls_discover_resp.dialog_token = dialog_token; mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata)); - - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); - ieee80211_tdls_add_ext_capab(skb); + cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata, + status_code)); break; default: return -EINVAL; @@ -168,33 +502,28 @@ ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, return 0; } -int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, - const u8 *peer, u8 action_code, u8 dialog_token, - u16 status_code, u32 peer_capability, - const u8 *extra_ies, size_t extra_ies_len) +static int +ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, + u8 dialog_token, u16 status_code, + u32 peer_capability, bool initiator, + const u8 *extra_ies, size_t extra_ies_len) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb = NULL; bool send_direct; + struct sta_info *sta; int ret; - if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) - return -ENOTSUPP; - - /* make sure we are in managed mode, and associated */ - if (sdata->vif.type != NL80211_IFTYPE_STATION || - !sdata->u.mgd.associated) - return -EINVAL; - - 
tdls_dbg(sdata, "TDLS mgmt action %d peer %pM\n", - action_code, peer); - skb = dev_alloc_skb(local->hw.extra_tx_headroom + max(sizeof(struct ieee80211_mgmt), sizeof(struct ieee80211_tdls_data)) + 50 + /* supported rates */ 7 + /* ext capab */ + 26 + /* max(WMM-info, WMM-param) */ + 2 + max(sizeof(struct ieee80211_ht_cap), + sizeof(struct ieee80211_ht_operation)) + extra_ies_len + sizeof(struct ieee80211_tdls_lnkie)); if (!skb) @@ -227,30 +556,48 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, if (ret < 0) goto fail; - if (extra_ies_len) - memcpy(skb_put(skb, extra_ies_len), extra_ies, extra_ies_len); + rcu_read_lock(); + sta = sta_info_get(sdata, peer); - /* the TDLS link IE is always added last */ + /* infer the initiator if we can, to support old userspace */ switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: + if (sta) + set_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); + /* fall-through */ case WLAN_TDLS_SETUP_CONFIRM: - case WLAN_TDLS_TEARDOWN: case WLAN_TDLS_DISCOVERY_REQUEST: - /* we are the initiator */ - ieee80211_tdls_add_link_ie(skb, sdata->vif.addr, peer, - sdata->u.mgd.bssid); + initiator = true; break; case WLAN_TDLS_SETUP_RESPONSE: + /* + * In some testing scenarios, we send a request and response. + * Make the last packet sent take effect for the initiator + * value. 
+ */ + if (sta) + clear_sta_flag(sta, WLAN_STA_TDLS_INITIATOR); + /* fall-through */ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - /* we are the responder */ - ieee80211_tdls_add_link_ie(skb, peer, sdata->vif.addr, - sdata->u.mgd.bssid); + initiator = false; + break; + case WLAN_TDLS_TEARDOWN: + /* any value is ok */ break; default: ret = -ENOTSUPP; - goto fail; + break; } + if (sta && test_sta_flag(sta, WLAN_STA_TDLS_INITIATOR)) + initiator = true; + + rcu_read_unlock(); + if (ret < 0) + goto fail; + + ieee80211_tdls_add_ies(sdata, skb, peer, action_code, status_code, + initiator, extra_ies, extra_ies_len); if (send_direct) { ieee80211_tx_skb(sdata, skb); return 0; @@ -284,11 +631,175 @@ fail: return ret; } +static int +ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, u32 peer_capability, bool initiator, + const u8 *extra_ies, size_t extra_ies_len) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int ret; + + mutex_lock(&local->mtx); + + /* we don't support concurrent TDLS peer setups */ + if (!is_zero_ether_addr(sdata->u.mgd.tdls_peer) && + !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) { + ret = -EBUSY; + goto exit; + } + + /* + * make sure we have a STA representing the peer so we drop or buffer + * non-TDLS-setup frames to the peer. We can't send other packets + * during setup through the AP path. + * Allow error packets to be sent - sometimes we don't even add a STA + * before failing the setup. 
+ */ + if (status_code == 0) { + rcu_read_lock(); + if (!sta_info_get(sdata, peer)) { + rcu_read_unlock(); + ret = -ENOLINK; + goto exit; + } + rcu_read_unlock(); + } + + ieee80211_flush_queues(local, sdata); + + ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, action_code, + dialog_token, status_code, + peer_capability, initiator, + extra_ies, extra_ies_len); + if (ret < 0) + goto exit; + + memcpy(sdata->u.mgd.tdls_peer, peer, ETH_ALEN); + ieee80211_queue_delayed_work(&sdata->local->hw, + &sdata->u.mgd.tdls_peer_del_work, + TDLS_PEER_SETUP_TIMEOUT); + +exit: + mutex_unlock(&local->mtx); + return ret; +} + +static int +ieee80211_tdls_mgmt_teardown(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, u32 peer_capability, + bool initiator, const u8 *extra_ies, + size_t extra_ies_len) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + int ret; + + /* + * No packets can be transmitted to the peer via the AP during setup - + * the STA is set as a TDLS peer, but is not authorized. + * During teardown, we prevent direct transmissions by stopping the + * queues and flushing all direct packets. + */ + ieee80211_stop_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN); + ieee80211_flush_queues(local, sdata); + + ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, action_code, + dialog_token, status_code, + peer_capability, initiator, + extra_ies, extra_ies_len); + if (ret < 0) + sdata_err(sdata, "Failed sending TDLS teardown packet %d\n", + ret); + + /* + * Remove the STA AUTH flag to force further traffic through the AP. If + * the STA was unreachable, it was already removed. 
+ */ + rcu_read_lock(); + sta = sta_info_get(sdata, peer); + if (sta) + clear_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); + rcu_read_unlock(); + + ieee80211_wake_vif_queues(local, sdata, + IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN); + + return 0; +} + +int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, + const u8 *peer, u8 action_code, u8 dialog_token, + u16 status_code, u32 peer_capability, + bool initiator, const u8 *extra_ies, + size_t extra_ies_len) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int ret; + + if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) + return -ENOTSUPP; + + /* make sure we are in managed mode, and associated */ + if (sdata->vif.type != NL80211_IFTYPE_STATION || + !sdata->u.mgd.associated) + return -EINVAL; + + switch (action_code) { + case WLAN_TDLS_SETUP_REQUEST: + case WLAN_TDLS_SETUP_RESPONSE: + ret = ieee80211_tdls_mgmt_setup(wiphy, dev, peer, action_code, + dialog_token, status_code, + peer_capability, initiator, + extra_ies, extra_ies_len); + break; + case WLAN_TDLS_TEARDOWN: + ret = ieee80211_tdls_mgmt_teardown(wiphy, dev, peer, + action_code, dialog_token, + status_code, + peer_capability, initiator, + extra_ies, extra_ies_len); + break; + case WLAN_TDLS_DISCOVERY_REQUEST: + /* + * Protect the discovery so we can hear the TDLS discovery + * response frame. It is transmitted directly and not buffered + * by the AP. 
+ */ + drv_mgd_protect_tdls_discover(sdata->local, sdata); + /* fall-through */ + case WLAN_TDLS_SETUP_CONFIRM: + case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: + /* no special handling */ + ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, + action_code, + dialog_token, + status_code, + peer_capability, + initiator, extra_ies, + extra_ies_len); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + tdls_dbg(sdata, "TDLS mgmt action %d peer %pM status %d\n", + action_code, peer, ret); + return ret; +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { struct sta_info *sta; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + int ret; if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) return -ENOTSUPP; @@ -296,6 +807,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; + switch (oper) { + case NL80211_TDLS_ENABLE_LINK: + case NL80211_TDLS_DISABLE_LINK: + break; + case NL80211_TDLS_TEARDOWN: + case NL80211_TDLS_SETUP: + case NL80211_TDLS_DISCOVERY_REQ: + /* We don't support in-driver setup/teardown/discovery */ + return -ENOTSUPP; + } + + mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); switch (oper) { @@ -304,22 +827,60 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, sta = sta_info_get(sdata, peer); if (!sta) { rcu_read_unlock(); - return -ENOLINK; + ret = -ENOLINK; + break; } set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); rcu_read_unlock(); + + WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || + !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); + ret = 0; break; case NL80211_TDLS_DISABLE_LINK: - return sta_info_destroy_addr(sdata, peer); - case NL80211_TDLS_TEARDOWN: - case NL80211_TDLS_SETUP: - case NL80211_TDLS_DISCOVERY_REQ: - /* We don't support in-driver setup/teardown/discovery */ 
- return -ENOTSUPP; + /* + * The teardown message in ieee80211_tdls_mgmt_teardown() was + * created while the queues were stopped, so it might still be + * pending. Before flushing the queues we need to be sure the + * message is handled by the tasklet handling pending messages, + * otherwise we might start destroying the station before + * sending the teardown packet. + * Note that this only forces the tasklet to flush pendings - + * not to stop the tasklet from rescheduling itself. + */ + tasklet_kill(&local->tx_pending_tasklet); + /* flush a potentially queued teardown packet */ + ieee80211_flush_queues(local, sdata); + + ret = sta_info_destroy_addr(sdata, peer); + break; default: - return -ENOTSUPP; + ret = -ENOTSUPP; + break; } - return 0; + if (ret == 0 && ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) { + cancel_delayed_work(&sdata->u.mgd.tdls_peer_del_work); + eth_zero_addr(sdata->u.mgd.tdls_peer); + } + + mutex_unlock(&local->mtx); + return ret; +} + +void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer, + enum nl80211_tdls_operation oper, + u16 reason_code, gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc) { + sdata_err(sdata, "Discarding TDLS oper %d - not STA or disconnected\n", + oper); + return; + } + + cfg80211_tdls_oper_request(sdata->dev, peer, oper, reason_code, gfp); } +EXPORT_SYMBOL(ieee80211_tdls_oper_request); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index cfe1a0688b5c..38fae7ebe984 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -672,13 +672,13 @@ DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, u8 value), + TP_PROTO(struct ieee80211_local *local, s16 value), TP_ARGS(local, value), TP_STRUCT__entry( LOCAL_ENTRY - __field(u8, value) + __field(s16, value) ), TP_fast_assign( @@ -1330,6 +1330,13 @@ 
DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx, TP_ARGS(local, sdata) ); +DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + + TP_ARGS(local, sdata) +); + DECLARE_EVENT_CLASS(local_chanctx, TP_PROTO(struct ieee80211_local *local, struct ieee80211_chanctx *ctx), diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a252c606ad0..900632a250ec 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -250,7 +251,8 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (local->hw.conf.flags & IEEE80211_CONF_PS) { ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_PS); + IEEE80211_QUEUE_STOP_REASON_PS, + false); ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; ieee80211_queue_work(&local->hw, &local->dynamic_ps_disable_work); @@ -473,7 +475,8 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) return TX_CONTINUE; if (unlikely((test_sta_flag(sta, WLAN_STA_PS_STA) || - test_sta_flag(sta, WLAN_STA_PS_DRIVER)) && + test_sta_flag(sta, WLAN_STA_PS_DRIVER) || + test_sta_flag(sta, WLAN_STA_PS_DELIVER)) && !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) { int ac = skb_get_queue_mapping(tx->skb); @@ -496,7 +499,8 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) * ahead and Tx the packet. 
*/ if (!test_sta_flag(sta, WLAN_STA_PS_STA) && - !test_sta_flag(sta, WLAN_STA_PS_DRIVER)) { + !test_sta_flag(sta, WLAN_STA_PS_DRIVER) && + !test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { spin_unlock(&sta->ps_lock); return TX_CONTINUE; } @@ -1475,7 +1479,10 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, tail_need = max_t(int, tail_need, 0); } - if (skb_cloned(skb)) + if (skb_cloned(skb) && + (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) || + !skb_clone_writable(skb, ETH_HLEN) || + sdata->crypto_tx_tailroom_needed_cnt)) I802_DEBUG_INC(local->tx_expand_skb_head_cloned); else if (head_need || tail_need) I802_DEBUG_INC(local->tx_expand_skb_head); @@ -1618,12 +1625,12 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, { struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_channel *chan; struct ieee80211_radiotap_header *prthdr = (struct ieee80211_radiotap_header *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr; struct ieee80211_sub_if_data *tmp_sdata, *sdata; + struct cfg80211_chan_def *chandef; u16 len_rthdr; int hdrlen; @@ -1721,9 +1728,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, } if (chanctx_conf) - chan = chanctx_conf->def.chan; + chandef = &chanctx_conf->def; else if (!local->use_chanctx) - chan = local->_oper_chandef.chan; + chandef = &local->_oper_chandef; else goto fail_rcu; @@ -1743,10 +1750,11 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * radar detection by itself. We can do that later by adding a * monitor flag interfaces used for AP support. 
*/ - if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR))) + if (!cfg80211_reg_can_beacon(local->hw.wiphy, chandef, + sdata->vif.type)) goto fail_rcu; - ieee80211_xmit(sdata, skb, chan->band); + ieee80211_xmit(sdata, skb, chandef->chan->band); rcu_read_unlock(); return NETDEV_TX_OK; @@ -1767,15 +1775,12 @@ fail: static void ieee80211_tx_latency_start_msrmnt(struct ieee80211_local *local, struct sk_buff *skb) { - struct timespec skb_arv; struct ieee80211_tx_latency_bin_ranges *tx_latency; tx_latency = rcu_dereference(local->tx_latency); if (!tx_latency) return; - - ktime_get_ts(&skb_arv); - skb->tstamp = ktime_set(skb_arv.tv_sec, skb_arv.tv_nsec); + skb->tstamp = ktime_get(); } /** @@ -1784,9 +1789,8 @@ static void ieee80211_tx_latency_start_msrmnt(struct ieee80211_local *local, * @skb: packet to be sent * @dev: incoming interface * - * Returns: 0 on success (and frees skb in this case) or 1 on failure (skb will - * not be freed, and caller is responsible for either retrying later or freeing - * skb). + * Returns: NETDEV_TX_OK both on success and on failure. On failure skb will + * be freed. * * This function takes in an Ethernet header and encapsulates it with suitable * IEEE 802.11 header based on which interface the packet is coming in. 
The @@ -1810,7 +1814,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, int nh_pos, h_pos; struct sta_info *sta = NULL; bool wme_sta = false, authorized = false, tdls_auth = false; - bool tdls_direct = false; + bool tdls_peer = false, tdls_setup_frame = false; bool multicast; u32 info_flags = 0; u16 info_id = 0; @@ -1843,7 +1847,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN); hdrlen = 30; authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); @@ -1952,34 +1956,35 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, #endif case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { - bool tdls_peer = false; - sta = sta_info_get(sdata, skb->data); if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; tdls_peer = test_sta_flag(sta, - WLAN_STA_TDLS_PEER); + WLAN_STA_TDLS_PEER); tdls_auth = test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); } - /* - * If the TDLS link is enabled, send everything - * directly. Otherwise, allow TDLS setup frames - * to be transmitted indirectly. - */ - tdls_direct = tdls_peer && (tdls_auth || - !(ethertype == ETH_P_TDLS && skb->len > 14 && - skb->data[14] == WLAN_TDLS_SNAP_RFTYPE)); + if (tdls_peer) + tdls_setup_frame = + ethertype == ETH_P_TDLS && + skb->len > 14 && + skb->data[14] == WLAN_TDLS_SNAP_RFTYPE; } - if (tdls_direct) { - /* link during setup - throw out frames to peer */ - if (!tdls_auth) - goto fail_rcu; + /* + * TDLS link during setup - throw out frames to peer. We allow + * TDLS-setup frames to unauthorized peers for the special case + * of a link teardown after a TDLS sta is removed due to being + * unreachable. 
+ */ + if (tdls_peer && !tdls_auth && !tdls_setup_frame) + goto fail_rcu; + /* send direct packets to authorized TDLS peers */ + if (tdls_peer && tdls_auth) { /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); @@ -2033,7 +2038,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, sta = sta_info_get(sdata, hdr.addr1); if (sta) { authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); - wme_sta = test_sta_flag(sta, WLAN_STA_WME); + wme_sta = sta->sta.wme; } } @@ -2067,30 +2072,23 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { - struct sk_buff *orig_skb = skb; + struct sk_buff *ack_skb = skb_clone_sk(skb); - skb = skb_clone(skb, GFP_ATOMIC); - if (skb) { + if (ack_skb) { unsigned long flags; int id; spin_lock_irqsave(&local->ack_status_lock, flags); - id = idr_alloc(&local->ack_status_frames, orig_skb, + id = idr_alloc(&local->ack_status_frames, ack_skb, 1, 0x10000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { info_id = id; info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - } else if (skb_shared(skb)) { - kfree_skb(orig_skb); } else { - kfree_skb(skb); - skb = orig_skb; + kfree_skb(ack_skb); } - } else { - /* couldn't clone -- lose tx status ... 
*/ - skb = orig_skb; } } @@ -2423,7 +2421,7 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, u8 *beacon_data; size_t beacon_data_len; int i; - u8 count = sdata->csa_current_counter; + u8 count = beacon->csa_current_counter; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: @@ -2442,46 +2440,53 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, return; } + rcu_read_lock(); for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) { - u16 counter_offset_beacon = - sdata->csa_counter_offset_beacon[i]; - u16 counter_offset_presp = sdata->csa_counter_offset_presp[i]; - - if (counter_offset_beacon) { - if (WARN_ON(counter_offset_beacon >= beacon_data_len)) - return; + resp = rcu_dereference(sdata->u.ap.probe_resp); - beacon_data[counter_offset_beacon] = count; - } - - if (sdata->vif.type == NL80211_IFTYPE_AP && - counter_offset_presp) { - rcu_read_lock(); - resp = rcu_dereference(sdata->u.ap.probe_resp); - - /* If nl80211 accepted the offset, this should - * not happen. 
- */ - if (WARN_ON(!resp)) { + if (beacon->csa_counter_offsets[i]) { + if (WARN_ON_ONCE(beacon->csa_counter_offsets[i] >= + beacon_data_len)) { rcu_read_unlock(); return; } - resp->data[counter_offset_presp] = count; - rcu_read_unlock(); + + beacon_data[beacon->csa_counter_offsets[i]] = count; } + + if (sdata->vif.type == NL80211_IFTYPE_AP && resp) + resp->data[resp->csa_counter_offsets[i]] = count; } + rcu_read_unlock(); } u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct beacon_data *beacon = NULL; + u8 count = 0; - sdata->csa_current_counter--; + rcu_read_lock(); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + beacon = rcu_dereference(sdata->u.ap.beacon); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + beacon = rcu_dereference(sdata->u.ibss.presp); + else if (ieee80211_vif_is_mesh(&sdata->vif)) + beacon = rcu_dereference(sdata->u.mesh.beacon); + + if (!beacon) + goto unlock; + + beacon->csa_current_counter--; /* the counter should never reach 0 */ - WARN_ON(!sdata->csa_current_counter); + WARN_ON_ONCE(!beacon->csa_current_counter); + count = beacon->csa_current_counter; - return sdata->csa_current_counter; +unlock: + rcu_read_unlock(); + return count; } EXPORT_SYMBOL(ieee80211_csa_update_counter); @@ -2491,7 +2496,6 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; - int counter_beacon = sdata->csa_counter_offset_beacon[0]; int ret = false; if (!ieee80211_sdata_running(sdata)) @@ -2529,10 +2533,13 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif) goto out; } - if (WARN_ON(counter_beacon > beacon_data_len)) + if (!beacon->csa_counter_offsets[0]) goto out; - if (beacon_data[counter_beacon] == 1) + if (WARN_ON_ONCE(beacon->csa_counter_offsets[0] > beacon_data_len)) + goto out; + + if (beacon_data[beacon->csa_counter_offsets[0]] == 1) ret = true; out: rcu_read_unlock(); @@ -2548,6 +2555,7 
@@ __ieee80211_beacon_get(struct ieee80211_hw *hw, bool is_template) { struct ieee80211_local *local = hw_to_local(hw); + struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; @@ -2569,10 +2577,10 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_if_ap *ap = &sdata->u.ap; - struct beacon_data *beacon = rcu_dereference(ap->beacon); + beacon = rcu_dereference(ap->beacon); if (beacon) { - if (sdata->vif.csa_active) { + if (beacon->csa_counter_offsets[0]) { if (!is_template) ieee80211_csa_update_counter(vif); @@ -2613,37 +2621,37 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; - struct beacon_data *presp = rcu_dereference(ifibss->presp); - if (!presp) + beacon = rcu_dereference(ifibss->presp); + if (!beacon) goto out; - if (sdata->vif.csa_active) { + if (beacon->csa_counter_offsets[0]) { if (!is_template) ieee80211_csa_update_counter(vif); - ieee80211_set_csa(sdata, presp); + ieee80211_set_csa(sdata, beacon); } - skb = dev_alloc_skb(local->tx_headroom + presp->head_len + + skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); - memcpy(skb_put(skb, presp->head_len), presp->head, - presp->head_len); + memcpy(skb_put(skb, beacon->head_len), beacon->head, + beacon->head_len); hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct beacon_data *bcn = rcu_dereference(ifmsh->beacon); - if (!bcn) + beacon = rcu_dereference(ifmsh->beacon); + if (!beacon) goto out; - if (sdata->vif.csa_active) { + if (beacon->csa_counter_offsets[0]) { 
if (!is_template) /* TODO: For mesh csa_counter is in TU, so * decrementing it by one isn't correct, but @@ -2652,40 +2660,42 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, */ ieee80211_csa_update_counter(vif); - ieee80211_set_csa(sdata, bcn); + ieee80211_set_csa(sdata, beacon); } if (ifmsh->sync_ops) - ifmsh->sync_ops->adjust_tbtt(sdata, bcn); + ifmsh->sync_ops->adjust_tbtt(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + - bcn->head_len + + beacon->head_len + 256 + /* TIM IE */ - bcn->tail_len + + beacon->tail_len + local->hw.extra_beacon_tailroom); if (!skb) goto out; skb_reserve(skb, local->tx_headroom); - memcpy(skb_put(skb, bcn->head_len), bcn->head, bcn->head_len); + memcpy(skb_put(skb, beacon->head_len), beacon->head, + beacon->head_len); ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template); if (offs) { - offs->tim_offset = bcn->head_len; - offs->tim_length = skb->len - bcn->head_len; + offs->tim_offset = beacon->head_len; + offs->tim_length = skb->len - beacon->head_len; } - memcpy(skb_put(skb, bcn->tail_len), bcn->tail, bcn->tail_len); + memcpy(skb_put(skb, beacon->tail_len), beacon->tail, + beacon->tail_len); } else { WARN_ON(1); goto out; } /* CSA offsets */ - if (offs) { + if (offs && beacon) { int i; for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) { - u16 csa_off = sdata->csa_counter_offset_beacon[i]; + u16 csa_off = beacon->csa_counter_offsets[i]; if (!csa_off) continue; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index a6cda52ed920..3c61060a4d2b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -317,7 +318,8 @@ void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) } static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); @@ -329,7 +331,13 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; - __clear_bit(reason, &local->queue_stop_reasons[queue]); + if (!refcounted) + local->q_stop_reasons[queue][reason] = 0; + else + local->q_stop_reasons[queue][reason]--; + + if (local->q_stop_reasons[queue][reason] == 0) + __clear_bit(reason, &local->queue_stop_reasons[queue]); if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ @@ -344,25 +352,28 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - __ieee80211_wake_queue(hw, queue, reason); + __ieee80211_wake_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue) { ieee80211_wake_queue_by_reason(hw, queue, - IEEE80211_QUEUE_STOP_REASON_DRIVER); + IEEE80211_QUEUE_STOP_REASON_DRIVER, + false); } EXPORT_SYMBOL(ieee80211_wake_queue); static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, - enum 
queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -373,10 +384,13 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, if (WARN_ON(queue >= hw->queues)) return; - if (test_bit(reason, &local->queue_stop_reasons[queue])) - return; + if (!refcounted) + local->q_stop_reasons[queue][reason] = 1; + else + local->q_stop_reasons[queue][reason]++; - __set_bit(reason, &local->queue_stop_reasons[queue]); + if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue])) + return; if (local->hw.queues < IEEE80211_NUM_ACS) n_acs = 1; @@ -398,20 +412,22 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, } void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, - enum queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - __ieee80211_stop_queue(hw, queue, reason); + __ieee80211_stop_queue(hw, queue, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue) { ieee80211_stop_queue_by_reason(hw, queue, - IEEE80211_QUEUE_STOP_REASON_DRIVER); + IEEE80211_QUEUE_STOP_REASON_DRIVER, + false); } EXPORT_SYMBOL(ieee80211_stop_queue); @@ -429,9 +445,11 @@ void ieee80211_add_pending_skb(struct ieee80211_local *local, } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, + false); __skb_queue_tail(&local->pending[queue], skb); - __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD, + false); 
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -455,20 +473,23 @@ void ieee80211_add_pending_skbs(struct ieee80211_local *local, queue = info->hw_queue; __ieee80211_stop_queue(hw, queue, - IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + IEEE80211_QUEUE_STOP_REASON_SKB_ADD, + false); __skb_queue_tail(&local->pending[queue], skb); } for (i = 0; i < hw->queues; i++) __ieee80211_wake_queue(hw, i, - IEEE80211_QUEUE_STOP_REASON_SKB_ADD); + IEEE80211_QUEUE_STOP_REASON_SKB_ADD, + false); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, - enum queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; @@ -477,7 +498,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) - __ieee80211_stop_queue(hw, i, reason); + __ieee80211_stop_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -485,7 +506,8 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, void ieee80211_stop_queues(struct ieee80211_hw *hw) { ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_DRIVER); + IEEE80211_QUEUE_STOP_REASON_DRIVER, + false); } EXPORT_SYMBOL(ieee80211_stop_queues); @@ -508,7 +530,8 @@ EXPORT_SYMBOL(ieee80211_queue_stopped); void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, unsigned long queues, - enum queue_stop_reason reason) + enum queue_stop_reason reason, + bool refcounted) { struct ieee80211_local *local = hw_to_local(hw); unsigned long flags; @@ -517,7 +540,7 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for_each_set_bit(i, &queues, hw->queues) - __ieee80211_wake_queue(hw, i, reason); + 
__ieee80211_wake_queue(hw, i, reason, refcounted); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } @@ -525,17 +548,16 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw, void ieee80211_wake_queues(struct ieee80211_hw *hw) { ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_DRIVER); + IEEE80211_QUEUE_STOP_REASON_DRIVER, + false); } EXPORT_SYMBOL(ieee80211_wake_queues); -void ieee80211_flush_queues(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +static unsigned int +ieee80211_get_vif_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { - u32 queues; - - if (!local->ops->flush) - return; + unsigned int queues; if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) { int ac; @@ -551,13 +573,46 @@ void ieee80211_flush_queues(struct ieee80211_local *local, queues = BIT(local->hw.queues) - 1; } - ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_FLUSH); + return queues; +} + +void ieee80211_flush_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + unsigned int queues; + + if (!local->ops->flush) + return; + + queues = ieee80211_get_vif_queues(local, sdata); + + ieee80211_stop_queues_by_reason(&local->hw, queues, + IEEE80211_QUEUE_STOP_REASON_FLUSH, + false); drv_flush(local, sdata, queues, false); - ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_FLUSH); + ieee80211_wake_queues_by_reason(&local->hw, queues, + IEEE80211_QUEUE_STOP_REASON_FLUSH, + false); +} + +void ieee80211_stop_vif_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum queue_stop_reason reason) +{ + ieee80211_stop_queues_by_reason(&local->hw, + ieee80211_get_vif_queues(local, sdata), + reason, true); +} + +void ieee80211_wake_vif_queues(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum 
queue_stop_reason reason) +{ + ieee80211_wake_queues_by_reason(&local->hw, + ieee80211_get_vif_queues(local, sdata), + reason, true); } static void __iterate_active_interfaces(struct ieee80211_local *local, @@ -960,6 +1015,31 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } elems->pwr_constr_elem = pos; break; + case WLAN_EID_CISCO_VENDOR_SPECIFIC: + /* Lots of different options exist, but we only care + * about the Dynamic Transmit Power Control element. + * First check for the Cisco OUI, then for the DTPC + * tag (0x00). + */ + if (elen < 4) { + elem_parse_failed = true; + break; + } + + if (pos[0] != 0x00 || pos[1] != 0x40 || + pos[2] != 0x96 || pos[3] != 0x00) + break; + + if (elen != 6) { + elem_parse_failed = true; + break; + } + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + elems->cisco_dtpc_elem = pos; + break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; @@ -1166,14 +1246,17 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, - size_t buffer_len, const u8 *ie, size_t ie_len, - enum ieee80211_band band, u32 rate_mask, - struct cfg80211_chan_def *chandef) +static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, + u8 *buffer, size_t buffer_len, + const u8 *ie, size_t ie_len, + enum ieee80211_band band, + u32 rate_mask, + struct cfg80211_chan_def *chandef, + size_t *offset) { struct ieee80211_supported_band *sband; u8 *pos = buffer, *end = buffer + buffer_len; - size_t offset = 0, noffset; + size_t noffset; int supp_rates_len, i; u8 rates[32]; int num_rates; @@ -1181,6 +1264,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, int shift; u32 rate_flags; + *offset = 0; + sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband)) return 0; @@ -1219,12 +1304,12 @@ int 
ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, noffset = ieee80211_ie_split(ie, ie_len, before_extrates, ARRAY_SIZE(before_extrates), - offset); - if (end - pos < noffset - offset) + *offset); + if (end - pos < noffset - *offset) goto out_err; - memcpy(pos, ie + offset, noffset - offset); - pos += noffset - offset; - offset = noffset; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; } ext_rates_len = num_rates - supp_rates_len; @@ -1258,12 +1343,12 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, }; noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), - offset); - if (end - pos < noffset - offset) + *offset); + if (end - pos < noffset - *offset) goto out_err; - memcpy(pos, ie + offset, noffset - offset); - pos += noffset - offset; - offset = noffset; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; } if (sband->ht_cap.ht_supported) { @@ -1298,12 +1383,12 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), - offset); - if (end - pos < noffset - offset) + *offset); + if (end - pos < noffset - *offset) goto out_err; - memcpy(pos, ie + offset, noffset - offset); - pos += noffset - offset; - offset = noffset; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; } if (sband->vht_cap.vht_supported) { @@ -1313,21 +1398,54 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, sband->vht_cap.cap); } - /* add any remaining custom IEs */ - if (ie && ie_len) { - noffset = ie_len; - if (end - pos < noffset - offset) - goto out_err; - memcpy(pos, ie + offset, noffset - offset); - pos += noffset - offset; - } - return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); return pos - buffer; } +int ieee80211_build_preq_ies(struct 
ieee80211_local *local, u8 *buffer, + size_t buffer_len, + struct ieee80211_scan_ies *ie_desc, + const u8 *ie, size_t ie_len, + u8 bands_used, u32 *rate_masks, + struct cfg80211_chan_def *chandef) +{ + size_t pos = 0, old_pos = 0, custom_ie_offset = 0; + int i; + + memset(ie_desc, 0, sizeof(*ie_desc)); + + for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + if (bands_used & BIT(i)) { + pos += ieee80211_build_preq_ies_band(local, + buffer + pos, + buffer_len - pos, + ie, ie_len, i, + rate_masks[i], + chandef, + &custom_ie_offset); + ie_desc->ies[i] = buffer + old_pos; + ie_desc->len[i] = pos - old_pos; + old_pos = pos; + } + } + + /* add any remaining custom IEs */ + if (ie && ie_len) { + if (WARN_ONCE(buffer_len - pos < ie_len - custom_ie_offset, + "not enough space for preq custom IEs\n")) + return pos; + memcpy(buffer + pos, ie + custom_ie_offset, + ie_len - custom_ie_offset); + ie_desc->common_ies = buffer + pos; + ie_desc->common_ie_len = ie_len - custom_ie_offset; + pos += ie_len - custom_ie_offset; + } + + return pos; +}; + struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, u32 ratemask, struct ieee80211_channel *chan, @@ -1340,6 +1458,8 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; int ies_len; + u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + struct ieee80211_scan_ies dummy_ie_desc; /* * Do not send DS Channel parameter for directed probe requests @@ -1357,10 +1477,11 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, if (!skb) return NULL; + rate_masks[chan->band] = ratemask; ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), - skb_tailroom(skb), - ie, ie_len, chan->band, - ratemask, &chandef); + skb_tailroom(skb), &dummy_ie_desc, + ie, ie_len, BIT(chan->band), + rate_masks, &chandef); skb_put(skb, ies_len); if (dst) { @@ -1604,7 +1725,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) if 
(local->use_chanctx) { mutex_lock(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) - WARN_ON(drv_add_chanctx(local, ctx)); + if (ctx->replace_state != + IEEE80211_CHANCTX_REPLACES_OTHER) + WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); list_for_each_entry(sdata, &local->interfaces, list) { @@ -1798,7 +1921,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, - IEEE80211_QUEUE_STOP_REASON_SUSPEND); + IEEE80211_QUEUE_STOP_REASON_SUSPEND, + false); /* * Reconfigure sched scan if it was interrupted by FW restart or @@ -2836,6 +2960,35 @@ void ieee80211_recalc_dtim(struct ieee80211_local *local, ps->dtim_count = dtim_count; } +static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + struct ieee80211_sub_if_data *sdata; + u8 radar_detect = 0; + + lockdep_assert_held(&local->chanctx_mtx); + + if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) + return 0; + + list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list) + if (sdata->reserved_radar_required) + radar_detect |= BIT(sdata->reserved_chandef.width); + + /* + * An in-place reservation context should not have any assigned vifs + * until it replaces the other context. 
+ */ + WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && + !list_empty(&ctx->assigned_vifs)); + + list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list) + if (sdata->radar_required) + radar_detect |= BIT(sdata->vif.bss_conf.chandef.width); + + return radar_detect; +} + int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, @@ -2877,8 +3030,9 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, num[iftype] = 1; list_for_each_entry(ctx, &local->chanctx_list, list) { - if (ctx->conf.radar_enabled) - radar_detect |= BIT(ctx->conf.def.width); + if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) + continue; + radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) { num_different_channels++; continue; @@ -2935,10 +3089,12 @@ int ieee80211_max_num_channels(struct ieee80211_local *local) lockdep_assert_held(&local->chanctx_mtx); list_for_each_entry(ctx, &local->chanctx_list, list) { + if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) + continue; + num_different_channels++; - if (ctx->conf.radar_enabled) - radar_detect |= BIT(ctx->conf.def.width); + radar_detect |= ieee80211_chanctx_radar_detect(local, ctx); } list_for_each_entry_rcu(sdata, &local->interfaces, list) @@ -2953,3 +3109,18 @@ int ieee80211_max_num_channels(struct ieee80211_local *local) return max_num_different_channels; } + +u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo) +{ + *buf++ = WLAN_EID_VENDOR_SPECIFIC; + *buf++ = 7; /* len */ + *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */ + *buf++ = 0x50; + *buf++ = 0xf2; + *buf++ = 2; /* WME */ + *buf++ = 0; /* WME info */ + *buf++ = 1; /* WME ver */ + *buf++ = qosinfo; /* U-APSD no in use */ + + return buf; +} diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 9265adfdabfc..671ce0d27a80 100644 --- a/net/mac80211/vht.c +++ 
b/net/mac80211/vht.c @@ -129,6 +129,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* don't support VHT for TDLS peers for now */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index 6ee2b5863572..9181fb6d6437 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -271,22 +271,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, return ret; } - -static bool ieee80211_wep_is_weak_iv(struct sk_buff *skb, - struct ieee80211_key *key) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - unsigned int hdrlen; - u8 *ivpos; - u32 iv; - - hdrlen = ieee80211_hdrlen(hdr->frame_control); - ivpos = skb->data + hdrlen; - iv = (ivpos[0] << 16) | (ivpos[1] << 8) | ivpos[2]; - - return ieee80211_wep_weak_iv(iv, key->conf.keylen); -} - ieee80211_rx_result ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) { @@ -301,16 +285,12 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) if (!(status->flag & RX_FLAG_DECRYPTED)) { if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; - if (rx->sta && ieee80211_wep_is_weak_iv(rx->skb, rx->key)) - rx->sta->wep_weak_iv_count++; if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) return RX_DROP_UNUSABLE; } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) + IEEE80211_WEP_IV_LEN)) return RX_DROP_UNUSABLE; - if (rx->sta && ieee80211_wep_is_weak_iv(rx->skb, rx->key)) - rx->sta->wep_weak_iv_count++; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index d51422c778de..3b873989992c 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c 
@@ -1,5 +1,6 @@ /* * Copyright 2004, Instant802 Networks, Inc. + * Copyright 2013-2014 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -118,7 +119,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); if (sta) { - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; break; } case NL80211_IFTYPE_AP: @@ -145,7 +146,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, if (!sta && ra && !is_multicast_ether_addr(ra)) { sta = sta_info_get(sdata, ra); if (sta) - qos = test_sta_flag(sta, WLAN_STA_WME); + qos = sta->sta.wme; } rcu_read_unlock(); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 9b3dcc201145..983527a4c1ab 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -64,8 +64,11 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) if (!info->control.hw_key) tail += IEEE80211_TKIP_ICV_LEN; - if (WARN_ON(skb_tailroom(skb) < tail || - skb_headroom(skb) < IEEE80211_TKIP_IV_LEN)) + if (WARN(skb_tailroom(skb) < tail || + skb_headroom(skb) < IEEE80211_TKIP_IV_LEN, + "mmic: not enough head/tail (%d/%d,%d/%d)\n", + skb_headroom(skb), IEEE80211_TKIP_IV_LEN, + skb_tailroom(skb), tail)) return TX_DROP; key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; @@ -811,7 +814,7 @@ ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx) ieee80211_rx_result ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx) { - if (rx->sta->cipher_scheme) + if (rx->sta && rx->sta->cipher_scheme) return ieee80211_crypto_cs_decrypt(rx); return RX_DROP_UNUSABLE; diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c index 2cf66d885e68..b36b2b996578 100644 --- a/net/mac802154/ieee802154_dev.c +++ b/net/mac802154/ieee802154_dev.c @@ -143,6 +143,7 @@ static void mac802154_del_iface(struct wpan_phy *phy, 
struct net_device *dev) { struct mac802154_sub_if_data *sdata; + ASSERT_RTNL(); sdata = netdev_priv(dev); @@ -166,11 +167,13 @@ mac802154_add_iface(struct wpan_phy *phy, const char *name, int type) switch (type) { case IEEE802154_DEV_MONITOR: dev = alloc_netdev(sizeof(struct mac802154_sub_if_data), - name, mac802154_monitor_setup); + name, NET_NAME_UNKNOWN, + mac802154_monitor_setup); break; case IEEE802154_DEV_WPAN: dev = alloc_netdev(sizeof(struct mac802154_sub_if_data), - name, mac802154_wpan_setup); + name, NET_NAME_UNKNOWN, + mac802154_wpan_setup); break; default: dev = NULL; @@ -276,7 +279,8 @@ ieee802154_alloc_device(size_t priv_data_len, struct ieee802154_ops *ops) } priv = wpan_phy_priv(phy); - priv->hw.phy = priv->phy = phy; + priv->phy = phy; + priv->hw.phy = priv->phy; priv->hw.priv = (char *)priv + ALIGN(sizeof(*priv), NETDEV_ALIGN); priv->ops = ops; @@ -302,29 +306,61 @@ EXPORT_SYMBOL(ieee802154_free_device); int ieee802154_register_device(struct ieee802154_dev *dev) { struct mac802154_priv *priv = mac802154_to_priv(dev); - int rc = -ENOMEM; + int rc = -ENOSYS; + + if (dev->flags & IEEE802154_HW_TXPOWER) { + if (!priv->ops->set_txpower) + goto out; + + priv->phy->set_txpower = mac802154_set_txpower; + } + + if (dev->flags & IEEE802154_HW_LBT) { + if (!priv->ops->set_lbt) + goto out; + + priv->phy->set_lbt = mac802154_set_lbt; + } + + if (dev->flags & IEEE802154_HW_CCA_MODE) { + if (!priv->ops->set_cca_mode) + goto out; + + priv->phy->set_cca_mode = mac802154_set_cca_mode; + } + + if (dev->flags & IEEE802154_HW_CCA_ED_LEVEL) { + if (!priv->ops->set_cca_ed_level) + goto out; + + priv->phy->set_cca_ed_level = mac802154_set_cca_ed_level; + } + + if (dev->flags & IEEE802154_HW_CSMA_PARAMS) { + if (!priv->ops->set_csma_params) + goto out; + + priv->phy->set_csma_params = mac802154_set_csma_params; + } + + if (dev->flags & IEEE802154_HW_FRAME_RETRIES) { + if (!priv->ops->set_frame_retries) + goto out; + + priv->phy->set_frame_retries = 
mac802154_set_frame_retries; + } priv->dev_workqueue = create_singlethread_workqueue(wpan_phy_name(priv->phy)); - if (!priv->dev_workqueue) + if (!priv->dev_workqueue) { + rc = -ENOMEM; goto out; + } wpan_phy_set_dev(priv->phy, priv->hw.parent); priv->phy->add_iface = mac802154_add_iface; priv->phy->del_iface = mac802154_del_iface; - if (priv->ops->set_txpower) - priv->phy->set_txpower = mac802154_set_txpower; - if (priv->ops->set_lbt) - priv->phy->set_lbt = mac802154_set_lbt; - if (priv->ops->set_cca_mode) - priv->phy->set_cca_mode = mac802154_set_cca_mode; - if (priv->ops->set_cca_ed_level) - priv->phy->set_cca_ed_level = mac802154_set_cca_ed_level; - if (priv->ops->set_csma_params) - priv->phy->set_csma_params = mac802154_set_csma_params; - if (priv->ops->set_frame_retries) - priv->phy->set_frame_retries = mac802154_set_frame_retries; rc = wpan_phy_register(priv->phy); if (rc < 0) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 1456f73b02b9..457058142098 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -538,6 +538,7 @@ static int llsec_recover_addr(struct mac802154_llsec *sec, struct ieee802154_addr *addr) { __le16 caddr = sec->params.coord_shortaddr; + addr->pan_id = sec->params.pan_id; if (caddr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 15aa2f2b03a7..868a040fd422 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -175,9 +175,9 @@ static void phy_chan_notify(struct work_struct *work) mutex_lock(&priv->hw->phy->pib_lock); res = hw->ops->set_channel(&hw->hw, priv->page, priv->chan); - if (res) + if (res) { pr_debug("set_channel failed\n"); - else { + } else { priv->hw->phy->current_channel = priv->chan; priv->hw->phy->current_page = priv->page; } @@ -210,8 +210,9 @@ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan) INIT_WORK(&work->work, phy_chan_notify); work->dev = dev; queue_work(priv->hw->dev_workqueue, &work->work); - } 
else + } else { mutex_unlock(&priv->hw->phy->pib_lock); + } } diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 7f820a108a9c..a14cf9ede171 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -86,9 +86,8 @@ fail: static void mac802154_rx_worker(struct work_struct *work) { struct rx_work *rw = container_of(work, struct rx_work, work); - struct sk_buff *skb = rw->skb; - mac802154_subif_rx(rw->dev, skb, rw->lqi); + mac802154_subif_rx(rw->dev, rw->skb, rw->lqi); kfree(rw); } @@ -101,7 +100,7 @@ ieee802154_rx_irqsafe(struct ieee802154_dev *dev, struct sk_buff *skb, u8 lqi) if (!skb) return; - work = kzalloc(sizeof(struct rx_work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 6d1647399d4f..fdf4c0e67259 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -89,8 +89,7 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, if (!(priv->phy->channels_supported[page] & (1 << chan))) { WARN_ON(1); - kfree_skb(skb); - return NETDEV_TX_OK; + goto err_tx; } mac802154_monitors_rx(mac802154_to_priv(&priv->hw), skb); @@ -98,16 +97,15 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, if (!(priv->hw.flags & IEEE802154_HW_OMIT_CKSUM)) { u16 crc = crc_ccitt(0, skb->data, skb->len); u8 *data = skb_put(skb, 2); + data[0] = crc & 0xff; data[1] = crc >> 8; } - if (skb_cow_head(skb, priv->hw.extra_tx_headroom)) { - kfree_skb(skb); - return NETDEV_TX_OK; - } + if (skb_cow_head(skb, priv->hw.extra_tx_headroom)) + goto err_tx; - work = kzalloc(sizeof(struct xmit_work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { kfree_skb(skb); return NETDEV_TX_BUSY; @@ -128,4 +126,8 @@ netdev_tx_t mac802154_tx(struct mac802154_priv *priv, struct sk_buff *skb, queue_work(priv->dev_workqueue, &work->work); return NETDEV_TX_OK; + +err_tx: + kfree_skb(skb); + return NETDEV_TX_OK; } diff --git a/net/mac802154/wpan.c 
b/net/mac802154/wpan.c index 3c3069fd6971..4ab86a57dca5 100644 --- a/net/mac802154/wpan.c +++ b/net/mac802154/wpan.c @@ -90,7 +90,7 @@ mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } case SIOCSIFADDR: dev_warn(&dev->dev, - "Using DEBUGing ioctl SIOCSIFADDR isn't recommened!\n"); + "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n"); if (sa->family != AF_IEEE802154 || sa->addr.addr_type != IEEE802154_ADDR_SHORT || sa->addr.pan_id == IEEE802154_PANID_BROADCAST || @@ -462,7 +462,10 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, skb->pkt_type = PACKET_OTHERHOST; break; default: - break; + spin_unlock_bh(&sdata->mib_lock); + pr_debug("invalid dest mode\n"); + kfree_skb(skb); + return NET_RX_DROP; } spin_unlock_bh(&sdata->mib_lock); @@ -472,8 +475,7 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, rc = mac802154_llsec_decrypt(&sdata->sec, skb); if (rc) { pr_debug("decryption failed: %i\n", rc); - kfree_skb(skb); - return NET_RX_DROP; + goto fail; } sdata->dev->stats.rx_packets++; @@ -485,9 +487,12 @@ mac802154_subif_frame(struct mac802154_sub_if_data *sdata, struct sk_buff *skb, default: pr_warn("ieee802154: bad frame received (type = %d)\n", mac_cb(skb)->type); - kfree_skb(skb); - return NET_RX_DROP; + goto fail; } + +fail: + kfree_skb(skb); + return NET_RX_DROP; } static void mac802154_print_addr(const char *name, @@ -573,6 +578,7 @@ void mac802154_wpans_rx(struct mac802154_priv *priv, struct sk_buff *skb) ret = mac802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); + kfree_skb(skb); return; } diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 6b38d083e1c9..e28ed2ef5b06 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -65,15 +65,9 @@ out: return segs; } -static int mpls_gso_send_check(struct sk_buff *skb) -{ - return 0; -} - static struct packet_offload mpls_mc_offload = { .type = cpu_to_be16(ETH_P_MPLS_MC), 
.callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; @@ -81,7 +75,6 @@ static struct packet_offload mpls_mc_offload = { static struct packet_offload mpls_uc_offload = { .type = cpu_to_be16(ETH_P_MPLS_UC), .callbacks = { - .gso_send_check = mpls_gso_send_check, .gso_segment = mpls_gso_segment, }, }; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e9410d17619d..ae5096ab65eb 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -46,6 +46,9 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +config NF_LOG_COMMON + tristate + if NF_CONNTRACK config NF_CONNTRACK_MARK @@ -493,10 +496,19 @@ config NFT_LIMIT This option adds the "limit" expression that you can use to ratelimit rule matchings. -config NFT_NAT +config NFT_MASQ depends on NF_TABLES depends on NF_CONNTRACK depends on NF_NAT + tristate "Netfilter nf_tables masquerade support" + help + This option adds the "masquerade" expression that you can use + to perform NAT in the masquerade flavour. + +config NFT_NAT + depends on NF_TABLES + depends on NF_CONNTRACK + select NF_NAT tristate "Netfilter nf_tables nat module" help This option adds the "nat" expression that you can use to perform @@ -744,6 +756,9 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" + select NF_LOG_COMMON + select NF_LOG_IPV4 + select NF_LOG_IPV6 if IPV6 default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in @@ -760,6 +775,14 @@ config NETFILTER_XT_TARGET_MARK (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). +config NETFILTER_XT_NAT + tristate '"SNAT and DNAT" targets support' + depends on NF_NAT + ---help--- + This option enables the SNAT and DNAT targets. + + To compile it as a module, choose M here. If unsure, say N. 
+ config NETFILTER_XT_TARGET_NETMAP tristate '"NETMAP" target support' depends on NF_NAT @@ -833,6 +856,7 @@ config NETFILTER_XT_TARGET_TPROXY tristate '"TPROXY" target transparent proxying support' depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index bffdad774da7..a9571be3f791 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -47,6 +47,9 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o +# generic transport layer logging +obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o + obj-$(CONFIG_NF_NAT) += nf_nat.o # NAT protocols (nf_nat) @@ -84,6 +87,7 @@ obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o +obj-$(CONFIG_NFT_MASQ) += nft_masq.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o @@ -92,7 +96,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o -obj-$(CONFIG_NF_NAT) += xt_nat.o +obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o # targets obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 1fbab0cdd302..024a2e25c8a4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -35,11 +35,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { - int err; - - err = mutex_lock_interruptible(&afinfo_mutex); - if (err < 0) - return err; + mutex_lock(&afinfo_mutex); RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo); mutex_unlock(&afinfo_mutex); return 0; @@ 
-58,7 +54,7 @@ EXPORT_SYMBOL_GPL(nf_unregister_afinfo); struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -68,18 +64,15 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { struct nf_hook_ops *elem; - int err; - err = mutex_lock_interruptible(&nf_hook_mutex); - if (err < 0) - return err; + mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { if (reg->priority < elem->priority) break; } list_add_rcu(&reg->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; @@ -91,7 +84,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(&reg->list); mutex_unlock(&nf_hook_mutex); -#if defined(CONFIG_JUMP_LABEL) +#ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 2f7f5c32c6f9..234a8ec82076 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -99,6 +99,15 @@ config IP_SET_HASH_IPPORTNET To compile it as a module, choose M here. If unsure, say N. +config IP_SET_HASH_MAC + tristate "hash:mac set support" + depends on IP_SET + help + This option adds the hash:mac set type support, by which + one can store MAC (ethernet address) elements in a set. + + To compile it as a module, choose M here. If unsure, say N. 
+ config IP_SET_HASH_NETPORTNET tristate "hash:net,port,net set support" depends on IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile index 231f10196cb9..3dbd5e958489 100644 --- a/net/netfilter/ipset/Makefile +++ b/net/netfilter/ipset/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_MAC) += ip_set_hash_mac.o obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index f2c7d83dc23f..6f024a8a1534 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -128,6 +128,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, return 0; if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(x, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags); return 1; } @@ -161,6 +163,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(x, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(x, set), ext); return 0; } diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 6f1f9f494808..55b083ec587a 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 
/* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -112,7 +113,7 @@ bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ip_adt_elem e = { }; + struct bitmap_ip_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -132,14 +133,17 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], struct bitmap_ip *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; u32 ip = 0, ip_to = 0; - struct bitmap_ip_adt_elem e = { }; + struct bitmap_ip_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret = 0; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -357,6 +361,9 @@ static struct ip_set_type bitmap_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 740eabededd9..86104744b00f 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -27,7 +27,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ -#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define 
IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -203,7 +204,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = {}; + struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -232,7 +233,7 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = {}; + struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0; int ret = 0; @@ -240,7 +241,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -394,6 +398,9 @@ static struct ip_set_type bitmap_ipmac_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index cf99676e69f8..005dd36444c3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -22,7 +22,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ 
-#define IPSET_TYPE_REV_MAX 2 /* Comment support added */ +/* 2 Comment support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -104,7 +105,7 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_port_adt_elem e = {}; + struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be16 __port; u16 port = 0; @@ -129,7 +130,7 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_port_adt_elem e = {}; + struct bitmap_port_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port; /* wraparound */ u16 port_to; @@ -139,7 +140,10 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -291,6 +295,9 @@ static struct ip_set_type bitmap_port_type = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index ec8114fae50b..912e5a05b79d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ 
b/net/netfilter/ipset/ip_set_core.c @@ -101,7 +101,7 @@ load_settype(const char *name) nfnl_unlock(NFNL_SUBSYS_IPSET); pr_debug("try to load ip_set_%s\n", name); if (request_module("ip_set_%s", name) < 0) { - pr_warning("Can't find ip_set type %s\n", name); + pr_warn("Can't find ip_set type %s\n", name); nfnl_lock(NFNL_SUBSYS_IPSET); return false; } @@ -195,20 +195,19 @@ ip_set_type_register(struct ip_set_type *type) int ret = 0; if (type->protocol != IPSET_PROTOCOL) { - pr_warning("ip_set type %s, family %s, revision %u:%u uses " - "wrong protocol version %u (want %u)\n", - type->name, family_name(type->family), - type->revision_min, type->revision_max, - type->protocol, IPSET_PROTOCOL); + pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max, + type->protocol, IPSET_PROTOCOL); return -EINVAL; } ip_set_type_lock(); if (find_set_type(type->name, type->family, type->revision_min)) { /* Duplicate! 
*/ - pr_warning("ip_set type %s, family %s with revision min %u " - "already registered!\n", type->name, - family_name(type->family), type->revision_min); + pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", + type->name, family_name(type->family), + type->revision_min); ret = -EINVAL; goto unlock; } @@ -228,9 +227,9 @@ ip_set_type_unregister(struct ip_set_type *type) { ip_set_type_lock(); if (!find_set_type(type->name, type->family, type->revision_min)) { - pr_warning("ip_set type %s, family %s with revision min %u " - "not registered\n", type->name, - family_name(type->family), type->revision_min); + pr_warn("ip_set type %s, family %s with revision min %u not registered\n", + type->name, family_name(type->family), + type->revision_min); goto unlock; } list_del_rcu(&type->list); @@ -338,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -383,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -403,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return 
-IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); @@ -478,7 +502,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, if (ret == -EAGAIN) { /* Type requests element to be completed */ - pr_debug("element must be competed, ADD is triggered\n"); + pr_debug("element must be completed, ADD is triggered\n"); write_lock_bh(&set->lock); set->variant->kadt(set, skb, par, IPSET_ADD, opt); write_unlock_bh(&set->lock); @@ -1398,7 +1422,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); struct sk_buff *skb2; struct nlmsgerr *errmsg; - size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + size_t payload = min(SIZE_MAX, + sizeof(*errmsg) + nlmsg_len(nlh)); int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; struct nlattr *cmdattr; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 61c7fb052802..fee7c64e4dd1 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -565,8 +565,8 @@ retry: set->name, orig->htable_bits, htable_bits, orig); if (!htable_bits) { /* In case we have plenty of memory :-) */ - pr_warning("Cannot increase the hashsize of set %s further\n", - set->name); + pr_warn("Cannot increase the hashsize of set %s further\n", + set->name); return -IPSET_ERR_HASH_FULL; } t = ip_set_alloc(sizeof(*t) @@ -651,8 +651,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (h->elements >= h->maxelem) { if (net_ratelimit()) - pr_warning("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); + pr_warn("Set %s is full, maxelem %u reached\n", + set->name, 
h->maxelem); return -IPSET_ERR_HASH_FULL; } @@ -720,6 +720,8 @@ reuse_slot: ip_set_init_counter(ext_counter(data, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(data, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(data, set), ext); out: rcu_read_unlock_bh(); @@ -797,6 +799,9 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext, if (SET_WITH_COUNTER(set)) ip_set_update_counter(ext_counter(data, set), ext, mext, flags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(data, set), + ext, mext, flags); return mtype_do_data_match(data); } @@ -998,8 +1003,8 @@ mtype_list(const struct ip_set *set, nla_put_failure: nlmsg_trim(skb, incomplete); if (unlikely(first == cb->args[IPSET_CB_ARG0])) { - pr_warning("Can't list set %s: one bucket does not fit into " - "a message. Please report it!\n", set->name); + pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n", + set->name); cb->args[IPSET_CB_ARG0] = 0; return -EMSGSIZE; } @@ -1049,8 +1054,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, struct HTYPE *h; struct htable *t; +#ifndef IP_SET_PROTO_UNDEF if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; +#endif #ifdef IP_SET_HASH_WITH_MARKMASK markmask = 0xffffffff; @@ -1093,7 +1100,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (tb[IPSET_ATTR_MARKMASK]) { markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK])); - if ((markmask > 4294967295u) || markmask == 0) + if (markmask == 0) return -IPSET_ERR_INVALID_MARKMASK; } #endif @@ -1132,25 +1139,32 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, rcu_assign_pointer(h->table, t); set->data = h; +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { +#endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); +#ifndef 
IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); } +#endif if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); +#ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) +#endif IPSET_TOKEN(HTYPE, 4_gc_init)(set, IPSET_TOKEN(HTYPE, 4_gc)); +#ifndef IP_SET_PROTO_UNDEF else IPSET_TOKEN(HTYPE, 6_gc_init)(set, IPSET_TOKEN(HTYPE, 6_gc)); +#endif } - pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", set->name, jhash_size(t->htable_bits), t->htable_bits, h->maxelem, set->data, t); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd40607f878e..76959d79e9d1 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support */ /* 2 Comments support */ -#define IPSET_TYPE_REV_MAX 3 /* Forceadd support */ +/* 3 Forceadd support */ +#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -84,7 +85,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip4_elem e = {}; + struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); __be32 ip; @@ -103,7 +104,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip4_elem e = {}; + struct hash_ip4_elem e = { 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, hosts; int ret = 0; @@ -111,7 +112,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - 
!ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -222,7 +226,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip6_elem e = {}; + struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); @@ -239,7 +243,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ip6_elem e = {}; + struct hash_ip6_elem e = { { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; @@ -247,6 +251,9 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -295,6 +302,9 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 4eff0a297254..7abf9788cfa8 100644 --- 
a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -25,7 +25,8 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support */ +/* 1 Forceadd support */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); @@ -113,7 +114,10 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +248,9 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -301,6 +308,9 @@ static struct ip_set_type hash_ipmark_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7597b82a8b03..dcbcceb9a52f 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ 
b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -94,7 +95,7 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem e = { }; + struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -111,7 +112,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport4_elem e = { }; + struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; bool with_ports = false; @@ -122,7 +123,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -258,7 +262,7 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem e = { }; + struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = 
IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -275,7 +279,7 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipport *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipport6_elem e = { }; + struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -287,6 +291,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -370,6 +377,9 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 672655ffd573..7ef93fc887a1 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -28,7 +28,8 @@ /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ -#define IPSET_TYPE_REV_MAX 4 /* Forceadd support added */ +/* 4 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -95,7 +96,7 @@ hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, 
struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem e = { }; + struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -113,7 +114,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip4_elem e = { }; + struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; bool with_ports = false; @@ -124,7 +125,10 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -265,7 +269,7 @@ hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem e = { }; + struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, @@ -283,7 +287,7 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_ipportip *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportip6_elem e = { }; + struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -295,6 +299,9 @@ 
hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -382,6 +389,9 @@ static struct ip_set_type hash_ipportip_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 7308d84f9277..b6012ad92781 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -30,7 +30,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -179,7 +180,10 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -432,6 
+436,9 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) || tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) return -IPSET_ERR_PROTOCOL; @@ -541,6 +548,9 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c new file mode 100644 index 000000000000..65690b52a4d5 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -0,0 +1,173 @@ +/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:mac type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/if_ether.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +#define IPSET_TYPE_REV_MIN 0 +#define IPSET_TYPE_REV_MAX 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); +MODULE_ALIAS("ip_set_hash:mac"); + +/* Type specific function prefix */ +#define HTYPE hash_mac + +/* Member elements */ +struct hash_mac4_elem { + /* Zero valued MAC addresses cannot be stored */ + union { + unsigned char ether[ETH_ALEN]; + __be32 foo[2]; + }; +}; + +/* Common functions */ + +static inline bool +hash_mac4_data_equal(const struct hash_mac4_elem *e1, + const struct hash_mac4_elem *e2, + u32 *multi) +{ + return ether_addr_equal(e1->ether, e2->ether); +} + +static inline bool +hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) +{ + return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether); +} + +static inline void +hash_mac4_data_next(struct hash_mac4_elem *next, + const struct hash_mac4_elem *e) +{ +} + +#define MTYPE hash_mac4 +#define PF 4 +#define HOST_MASK 32 +#define IP_SET_EMIT_CREATE +#define IP_SET_PROTO_UNDEF +#include "ip_set_hash_gen.h" + +/* Zero valued element is not supported */ +static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; + +static int +hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, struct ip_set_adt_opt *opt) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + + /* MAC can be src only */ + if 
(!(opt->flags & IPSET_DIM_ONE_SRC)) + return 0; + + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); +} + +static int +hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; + struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + int ret; + + if (unlikely(!tb[IPSET_ATTR_ETHER] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_extensions(set, tb, &ext); + if (ret) + return ret; + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + return -IPSET_ERR_HASH_ELEM; + + return adtfn(set, &e, &ext, &ext, flags); +} + +static struct ip_set_type hash_mac_type __read_mostly = { + .name = "hash:mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_MAC, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = IPSET_TYPE_REV_MIN, + .revision_max = IPSET_TYPE_REV_MAX, + .create = hash_mac_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + 
.adt_policy = { + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, + [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, + [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_mac_init(void) +{ + return ip_set_type_register(&hash_mac_type); +} + +static void __exit +hash_mac_fini(void) +{ + ip_set_type_unregister(&hash_mac_type); +} + +module_init(hash_mac_init); +module_exit(hash_mac_fini); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 4c7d495783a3..6b3ac10ac2f1 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -27,7 +27,8 @@ /* 2 nomatch flag support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -150,7 +151,10 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -318,7 +322,10 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || 
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -377,6 +384,9 @@ static struct ip_set_type hash_net_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index db2606805b35..35dd35873442 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -28,7 +28,8 @@ /* 2 /0 support added */ /* 3 Counters support added */ /* 4 Comments support added */ -#define IPSET_TYPE_REV_MAX 5 /* Forceadd support added */ +/* 5 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -236,7 +237,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct nf_bridge_info *nf_bridge = skb->nf_bridge; if (!nf_bridge) @@ -281,7 +282,10 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || 
- !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -470,7 +474,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct nf_bridge_info *nf_bridge = skb->nf_bridge; if (!nf_bridge) @@ -514,7 +518,10 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -590,6 +597,9 @@ static struct ip_set_type hash_netiface_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3e99987e4bf2..da00284b3571 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -24,7 +24,8 @@ #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added 
*/ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -171,7 +172,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -203,7 +207,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], flags |= (IPSET_FLAG_NOMATCH << 16); } - if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] && + if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); @@ -219,9 +223,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (ip_to < ip) swap(ip, ip_to); - if (ip + UINT_MAX == ip_to) + if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr[0]); ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { @@ -230,10 +235,10 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; if (ip2_to < ip2_from) swap(ip2_from, ip2_to); - if (ip2_from + UINT_MAX == ip2_to) + if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); if (retried) ip = ntohl(h->next.ip[0]); @@ -393,7 +398,10 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || 
!ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -461,6 +469,9 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 1c645fbd09c7..c0ddb58d19dc 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -29,7 +29,8 @@ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ -#define IPSET_TYPE_REV_MAX 6 /* Forceadd support added */ +/* 6 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -172,7 +173,10 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -389,7 +393,10 @@ 
hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -489,6 +496,9 @@ static struct ip_set_type hash_netport_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index c0d2ba73f8b2..b8053d675fc3 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -26,7 +26,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 0 Comments support added */ -#define IPSET_TYPE_REV_MAX 1 /* Forceadd support added */ +/* 1 Forceadd support added */ +#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>"); @@ -189,7 +190,10 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + 
!ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -257,7 +261,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip, ip_to, e.cidr[0]); port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { @@ -275,7 +280,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; - } + } else + ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); if (retried) ip = ntohl(h->next.ip[0]); @@ -458,7 +464,10 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; @@ -567,6 +576,9 @@ static struct ip_set_type hash_netportnet_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 3e2317f3cf68..f8f682806e36 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -17,7 +17,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support 
added */ -#define IPSET_TYPE_REV_MAX 2 /* Comments support added */ +/* 2 Comments support added */ +#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); @@ -73,6 +74,10 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb, ip_set_update_counter(ext_counter(e, set), ext, &opt->ext, cmdflags); + if (SET_WITH_SKBINFO(set)) + ip_set_get_skbinfo(ext_skbinfo(e, set), + ext, &opt->ext, + cmdflags); return ret; } } @@ -197,6 +202,8 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); return 0; } @@ -307,6 +314,8 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(ext_comment(e, set), ext); + if (SET_WITH_SKBINFO(set)) + ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; @@ -378,7 +387,10 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || - !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES))) + !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) @@ -597,7 +609,9 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) struct set_elem *e; u32 i; - map = kzalloc(sizeof(*map) + size * set->dsize, GFP_KERNEL); + map = kzalloc(sizeof(*map) + + min_t(u32, size, IP_SET_LIST_MAX_SIZE) * 
set->dsize, + GFP_KERNEL); if (!map) return false; @@ -665,6 +679,9 @@ static struct ip_set_type list_set_type __read_mostly = { [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, + [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, + [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, + [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 0c3b1670b0d1..3b6929dec748 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -152,6 +152,16 @@ config IP_VS_WLC If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_FO + tristate "weighted failover scheduling" + ---help--- + The weighted failover scheduling algorithm directs network + connections to the server with the highest weight that is + currently available. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. 
+ config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 34ee602ddb66..38b2723b2e3d 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 610e19c0e13f..b0f7b626b56d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -27,6 +27,7 @@ #include <linux/interrupt.h> #include <linux/in.h> +#include <linux/inet.h> #include <linux/net.h> #include <linux/kernel.h> #include <linux/module.h> @@ -77,6 +78,13 @@ static unsigned int ip_vs_conn_rnd __read_mostly; #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) +/* We need an addrstrlen that works with or without v6 */ +#ifdef CONFIG_IP_VS_IPV6 +#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN +#else +#define IP_VS_ADDRSTRLEN (8+1) +#endif + struct ip_vs_aligned_lock { spinlock_t l; @@ -488,7 +496,12 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else +#endif + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: @@ -514,7 +527,10 @@ static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) break; case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit_v6; + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else + cp->packet_xmit = ip_vs_tunnel_xmit; break; case IP_VS_CONN_F_DROUTE: @@ -580,7 +596,7 @@ 
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) ip_vs_proto_name(cp->protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -616,7 +632,13 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; rcu_read_lock(); - dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + + /* This function is only invoked by the synchronization code. We do + * not currently support heterogeneous pools with synchronization, + * so we can make the assumption that the svc_af is the same as the + * dest_af + */ + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr, cp->dport, &cp->vaddr, cp->vport, cp->protocol, cp->fwmark, cp->flags); if (dest) { @@ -671,7 +693,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) ip_vs_proto_name(cp->protocol), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, cp->flags, atomic_read(&cp->refcnt), atomic_read(&dest->refcnt)); @@ -740,7 +762,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) ntohs(ct->cport), IP_VS_DBG_ADDR(ct->af, &ct->vaddr), ntohs(ct->vport), - IP_VS_DBG_ADDR(ct->af, &ct->daddr), + IP_VS_DBG_ADDR(ct->daf, &ct->daddr), ntohs(ct->dport)); /* @@ -848,7 +870,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(const struct ip_vs_conn_param *p, +ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, struct ip_vs_dest 
*dest, __u32 fwmark) { @@ -867,6 +889,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); ip_vs_conn_net_set(cp, p->net); cp->af = p->af; + cp->daf = dest_af; cp->protocol = p->protocol; ip_vs_addr_set(p->af, &cp->caddr, p->caddr); cp->cport = p->cport; @@ -874,7 +897,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->vaddr, p->vaddr); cp->vport = p->vport; - ip_vs_addr_set(p->af, &cp->daddr, daddr); + ip_vs_addr_set(cp->daf, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; cp->fwmark = fwmark; @@ -1036,6 +1059,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; + char dbuf[IP_VS_ADDRSTRLEN]; if (!ip_vs_conn_net_eq(cp, net)) return 0; @@ -1050,24 +1074,32 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) pe_data[len] = '\0'; #ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + +#ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " - "%pI6 %04X %-11s %7lu%s\n", + "%s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu%s\n", + " %s %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), (cp->timer.expires-jiffies)/HZ, pe_data); } @@ -1105,6 
+1137,7 @@ static const char *ip_vs_origin_name(unsigned int flags) static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) { + char dbuf[IP_VS_ADDRSTRLEN]; if (v == SEQ_START_TOKEN) seq_puts(seq, @@ -1117,12 +1150,21 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) return 0; #ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + +#ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), - &cp->daddr.in6, ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); @@ -1130,11 +1172,11 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) #endif seq_printf(seq, "%-3s %08X %04X %08X %04X " - "%08X %04X %-11s %-6s %7lu\n", + "%s %04X %-11s %-6s %7lu\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), - ntohl(cp->daddr.ip), ntohs(cp->dport), + dbuf, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e6836755c45d..990decba1fe4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -328,7 +328,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * This adds param.pe_data to the template, * and thus param.pe_data will be destroyed * when the template expires */ - ct = ip_vs_conn_new(¶m, &dest->addr, dport, + ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { 
kfree(param.pe_data); @@ -357,7 +357,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, src_port, &iph->daddr, dst_port, ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, + skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); *ignored = -1; @@ -479,7 +480,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &dest->addr, + cp = ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port ? dest->port : pptr[1], flags, dest, skb->mark); if (!cp) { @@ -491,9 +492,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), - IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), - IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), - IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), cp->flags, atomic_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); @@ -550,7 +551,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p); - cp = ip_vs_conn_new(&p, &daddr, 0, + cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark); if (!cp) @@ -1906,7 +1907,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { { .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c 
b/net/netfilter/ipvs/ip_vs_ctl.c index 581a6584ed0c..ac7ba689efe7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -574,8 +574,8 @@ bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol, * Called under RCU lock. */ static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; @@ -583,9 +583,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { - if ((dest->af == svc->af) - && ip_vs_addr_equal(svc->af, &dest->addr, daddr) - && (dest->port == dport)) { + if ((dest->af == dest_af) && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + (dest->port == dport)) { /* HIT */ return dest; } @@ -602,7 +602,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * on the backup. * Called under RCU lock, no refcnt is returned. 
*/ -struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, @@ -613,14 +613,14 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, struct ip_vs_service *svc; __be16 port = dport; - svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport); + svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; - dest = ip_vs_lookup_dest(svc, daddr, port); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) - dest = ip_vs_lookup_dest(svc, daddr, port ^ dport); + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } @@ -657,8 +657,8 @@ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) * scheduling. */ static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, - __be16 dport) +ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = net_ipvs(svc->net); @@ -671,11 +671,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - if (dest->af == svc->af && - ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + if (dest->af == dest_af && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && @@ -779,6 +779,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_scheduler *sched; int conn_flags; + /* We cannot modify an address and 
change the address family */ + BUG_ON(!add && udest->af != dest->af); + + if (add && udest->af != svc->af) + ipvs->mixed_address_family_dests++; + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; @@ -816,6 +822,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + dest->af = udest->af; + spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); @@ -847,7 +855,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { + if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -875,12 +883,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, u64_stats_init(&ip_vs_dest_stats->syncp); } - dest->af = svc->af; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -928,11 +936,11 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { @@ -944,12 +952,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Check if the dest already exists in the trash and * is from the same service */ - dest = 
ip_vs_trash_get_dest(svc, &daddr, dport); + dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), @@ -992,11 +1000,11 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &daddr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { @@ -1055,6 +1063,9 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, list_del_rcu(&dest->n_list); svc->num_dests--; + if (dest->af != svc->af) + net_ipvs(svc->net)->mixed_address_family_dests--; + if (svcupd) { struct ip_vs_scheduler *sched; @@ -1078,7 +1089,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* We use function that requires RCU lock */ rcu_read_lock(); - dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { @@ -1807,92 +1818,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = 
sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#endif { } }; @@ -2265,29 +2190,41 @@ static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) return 0; } +#define CMDID(cmd) (cmd - IP_VS_BASE_CTL) + +struct ip_vs_svcdest_user { + struct ip_vs_service_user s; + struct ip_vs_dest_user d; +}; -#define SET_CMDID(cmd) (cmd - 
IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { + [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), }; +union ip_vs_set_arglen { + struct ip_vs_service_user field_IP_VS_SO_SET_ADD; + struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; + struct ip_vs_service_user field_IP_VS_SO_SET_DEL; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; + struct 
ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; + struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; + struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; +}; + +#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) + static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { @@ -2318,6 +2255,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; } static int @@ -2325,7 +2263,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { struct net *net = sock_net(sk); int ret; - unsigned char arg[MAX_ARG_LEN]; + unsigned char arg[MAX_SET_ARGLEN]; struct ip_vs_service_user *usvc_compat; struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; @@ -2333,16 +2271,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); + BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) return -EINVAL; - if (len < 0 || len > MAX_ARG_LEN) - return -EINVAL; - if (len != set_arglen[SET_CMDID(cmd)]) { - pr_err("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); + if (len != set_arglen[CMDID(cmd)]) { + IP_VS_DBG(1, "set_ctl: len %u != %u\n", + len, set_arglen[CMDID(cmd)]); return -EINVAL; } @@ -2357,10 +2294,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - if (mutex_lock_interruptible(&ipvs->sync_mutex)) { 
- ret = -ERESTARTSYS; - goto out_dec; - } + mutex_lock(&ipvs->sync_mutex); if (cmd == IP_VS_SO_SET_STARTDAEMON) ret = start_sync_thread(net, dm->state, dm->mcast_ifn, dm->syncid); @@ -2370,11 +2304,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) goto out_dec; } - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - + mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(net, false); @@ -2562,6 +2492,12 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); @@ -2605,51 +2541,51 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) #endif } +static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { + [CMDID(IP_VS_SO_GET_VERSION)] = 64, + [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), + [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), + [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), + [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), + [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), +}; -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - 
[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +union ip_vs_get_arglen { + char field_IP_VS_SO_GET_VERSION[64]; + struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; + struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; + struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; + struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; + struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; +#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) + static int do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { - unsigned char arg[128]; + unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); + BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; - if (*len < get_arglen[GET_CMDID(cmd)]) { - pr_err("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); + copylen = get_arglen[CMDID(cmd)]; + if (*len < (int) copylen) { + IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } - copylen = get_arglen[GET_CMDID(cmd)]; - if (copylen > 128) - return -EINVAL; - if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* @@ -2659,9 +2595,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); - if (mutex_lock_interruptible(&ipvs->sync_mutex)) - return -ERESTARTSYS; - + mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = 
IP_VS_STATE_MASTER; strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, @@ -2680,9 +2614,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - + mutex_lock(&__ip_vs_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { @@ -2863,6 +2795,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3118,7 +3051,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns))) + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3199,6 +3133,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || @@ -3207,6 +3142,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; @@ -3216,6 +3152,11 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + 
udest->af = 0; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, @@ -3320,6 +3261,12 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + /* The synchronization protocol is incompatible with mixed family + * services + */ + if (net_ipvs(net)->mixed_address_family_dests > 0) + return -EINVAL; + return start_sync_thread(net, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), @@ -3443,6 +3390,35 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) need_full_dest); if (ret) goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; + + if (udest.af != svc->af) { + /* The synchronization protocol is incompatible + * with mixed family services + */ + if (net_ipvs(net)->sync_state) { + ret = -EINVAL; + goto out; + } + + /* Which connection types do we support? 
*/ + switch (udest.conn_flags) { + case IP_VS_CONN_F_TUNNEL: + /* We are able to forward this */ + break; + default: + ret = -EINVAL; + goto out; + } + } } switch (cmd) { diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index c3b84546ea9e..6be5c538b71e 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -234,7 +234,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c new file mode 100644 index 000000000000..e09874d02938 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -0,0 +1,79 @@ +/* + * IPVS: Weighted Fail Over module + * + * Authors: Kenny Mathis <kmathis@chokepoint.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Kenny Mathis : added initial functionality based on weight + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + +/* Weighted Fail Over Module */ +static struct ip_vs_dest * +ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *hweight = NULL; + int hw = 0; /* Track highest weight */ + + IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n"); + + /* Basic failover functionality + * Find virtual server with highest weight and send it traffic + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > hw) { + hweight = dest; + hw = atomic_read(&dest->weight); + } + } + + if (hweight) { + IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", + IP_VS_DBG_ADDR(hweight->af, &hweight->addr), + ntohs(hweight->port), + atomic_read(&hweight->activeconns), + atomic_read(&hweight->weight)); + return hweight; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_fo_scheduler = { + .name = "fo", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list), + .schedule = ip_vs_fo_schedule, +}; + +static int __init ip_vs_fo_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_fo_scheduler); +} + +static void __exit ip_vs_fo_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_fo_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_fo_init); +module_exit(ip_vs_fo_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 77c173282f38..a64fa15790e5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -233,7 +233,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct 
ip_vs_conn *cp, ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); - n_cp = ip_vs_conn_new(&p, &from, port, + /* As above, this is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -396,7 +397,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - n_cp = ip_vs_conn_new(&p, &cp->daddr, + /* This is ipv4 only */ + n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 547ff33c1efd..127f14046c51 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -199,11 +199,11 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblc_entry *en; - en = ip_vs_lblc_get(dest->af, tbl, daddr); + en = ip_vs_lblc_get(af, tbl, daddr); if (en) { if (en->dest == dest) return en; @@ -213,8 +213,8 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; ip_vs_dest_hold(dest); @@ -521,13 +521,13 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblc_new(tbl, &iph->daddr, dest); + ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", 
IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 3f21a2f47de1..2229d2d8bbe0 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -362,18 +362,18 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, - struct ip_vs_dest *dest) + u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblcr_entry *en; - en = ip_vs_lblcr_get(dest->af, tbl, daddr); + en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; - en->af = dest->af; - ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; /* initialize its dest set */ @@ -706,13 +706,13 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) - ip_vs_lblcr_new(tbl, &iph->daddr, dest); + ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 2bdcb1cf2127..19a0769a989a 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -59,7 +59,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, else IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " "inactconns %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), + 
IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->inactconns)); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index 961a6de9bb29..a8b63401e773 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -107,7 +107,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, out: IP_VS_DBG_BUF(6, "NQ: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 2f7ea7564044..5b84c0b56642 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -432,7 +432,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index e3a697234a98..8e92beb0cca9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -510,7 +510,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 
'R' : '.', - IP_VS_DBG_ADDR(cp->af, &cp->daddr), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 176b87c35e34..58bacfc461ee 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -95,7 +95,7 @@ stop: spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index e446b9fa7424..f8e2d00f528b 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -108,7 +108,8 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SED: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index cc65b2f42cd4..98a13433b68c 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -138,7 +138,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); /* if the original dest is unavailable, loop around the table * starting from ihash to find a new dest @@ -153,7 +153,7 @@ ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, return dest; 
IP_VS_DBG_BUF(6, "SH: selected unavailable " "server %s:%d (offset %d), reselecting", - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), roffset); } @@ -192,7 +192,7 @@ ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) RCU_INIT_POINTER(b->dest, dest); IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", - i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + i, IP_VS_DBG_ADDR(dest->af, &dest->addr), atomic_read(&dest->weight)); /* Don't move to next dest until filling weight */ @@ -342,7 +342,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->saddr), - IP_VS_DBG_ADDR(svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index db801263ee9f..7162c86fd50d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -880,14 +880,20 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, * but still handled. */ rcu_read_lock(); - dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, - param->vport, protocol, fwmark, flags); + /* This function is only invoked by the synchronization + * code. We do not currently support heterogeneous pools + * with synchronization, so we can make the assumption that + * the svc_af is the same as the dest_af + */ + dest = ip_vs_find_dest(net, type, type, daddr, dport, + param->vaddr, param->vport, protocol, + fwmark, flags); - cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, + fwmark); rcu_read_unlock(); if (!cp) { - if (param->pe_data) - kfree(param->pe_data); + kfree(param->pe_data); IP_VS_DBG(2, "BACKUP, add new conn. 
failed\n"); return; } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index b5b4650d50a9..6b366fd90554 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -80,7 +80,8 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_BUF(6, "WLC: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", - IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), atomic_read(&least->activeconns), atomic_read(&least->refcnt), atomic_read(&least->weight), loh); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 0546cd572d6b..17e6d4406ca7 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -216,7 +216,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, found: IP_VS_DBG_BUF(6, "WRR: server %s:%u " "activeconns %d refcnt %d weight %d\n", - IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), atomic_read(&dest->refcnt), atomic_read(&dest->weight)); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 73ba1cc7a88d..91f17c1eb8a2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -38,6 +38,7 @@ #include <net/route.h> /* for ip_route_output */ #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/ip_tunnels.h> #include <net/addrconf.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> @@ -156,18 +157,113 @@ retry: return rt; } +#ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} +#endif + +static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, + int rt_mode, + bool new_rt_is_local) +{ + bool rt_mode_allow_local = !!(rt_mode & 
IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); + bool source_is_loopback; + bool old_rt_is_local; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + + source_is_loopback = + (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (addr_type & IPV6_ADDR_LOOPBACK); + old_rt_is_local = __ip_vs_is_local_route6( + (struct rt6_info *)skb_dst(skb)); + } else +#endif + { + source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); + old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + } + + if (unlikely(new_rt_is_local)) { + if (!rt_mode_allow_local) + return true; + if (!rt_mode_allow_redirect && !old_rt_is_local) + return true; + } else { + if (!rt_mode_allow_non_local) + return true; + if (source_is_loopback) + return true; + } + return false; +} + +static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) +{ + struct sock *sk = skb->sk; + struct rtable *ort = skb_rtable(skb); + + if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); +} + +static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode, + struct ip_vs_iphdr *ipvsh, + struct sk_buff *skb, int mtu) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct net *net = dev_net(skb_dst(skb)->dev); + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", + &ipv6_hdr(skb)->saddr); + return false; + } + } else +#endif + { + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + /* If we're going to tunnel the packet and pmtu discovery + * is disabled, we'll just fragment it anyway + */ + if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) + 
return true; + + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len > mtu && !skb_is_gso(skb))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", + &ip_hdr(skb)->saddr); + return false; + } + } + + return true; +} + /* Get route to destination or remote server */ static int -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, - __be32 daddr, int rt_mode, __be32 *ret_saddr) +__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, int rt_mode, __be32 *ret_saddr, + struct ip_vs_iphdr *ipvsh) { struct net *net = dev_net(skb_dst(skb)->dev); - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ - struct rtable *ort; /* Original route */ - struct iphdr *iph; - __be16 df; int mtu; int local, noref = 1; @@ -217,30 +313,14 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", - (rt->rt_flags & RTCF_LOCAL) ? 
- "local":"non-local", &daddr); + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI4\n", &dest->addr.ip); goto err_put; } - iph = ip_hdr(skb); - if (likely(!local)) { - if (unlikely(ipv4_is_loopback(iph->saddr))) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI4 to non-local address, dest: %pI4\n", - &iph->saddr, &daddr); - goto err_put; - } - } else { - ort = skb_rtable(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !(ort->rt_flags & RTCF_LOCAL)) { - IP_VS_DBG_RL("Redirect from non-local address %pI4 to " - "local requires NAT method, dest: %pI4\n", - &iph->daddr, &daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) ip_rt_put(rt); @@ -249,28 +329,17 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { mtu = dst_mtu(&rt->dst); - df = iph->frag_off & htons(IP_DF); } else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; } - ort = skb_rtable(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); - /* MTU check allowed? */ - df = sysctl_pmtu_disc(ipvs) ? 
iph->frag_off & htons(IP_DF) : 0; + maybe_update_pmtu(skb_af, skb, mtu); } - /* MTU checking */ - if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -294,12 +363,6 @@ err_unreach: } #ifdef CONFIG_IP_VS_IPV6 - -static inline int __ip_vs_is_local_route6(struct rt6_info *rt) -{ - return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; -} - static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -338,14 +401,13 @@ out_err: * Get route to destination or remote server */ static int -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, +__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); struct ip_vs_dest_dst *dest_dst; struct rt6_info *rt; /* Route to the other host */ - struct rt6_info *ort; /* Original route */ struct dst_entry *dst; int mtu; int local, noref = 1; @@ -392,32 +454,15 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, } local = __ip_vs_is_local_route6(rt); - if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & - rt_mode)) { - IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n", - local ? 
"local":"non-local", daddr); + + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI6\n", &dest->addr.in6); goto err_put; } - if (likely(!local)) { - if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&ipv6_hdr(skb)->saddr) & - IPV6_ADDR_LOOPBACK)) { - IP_VS_DBG_RL("Stopping traffic from loopback address " - "%pI6c to non-local address, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->saddr, daddr); - goto err_put; - } - } else { - ort = (struct rt6_info *) skb_dst(skb); - if (!(rt_mode & IP_VS_RT_MODE_RDR) && - !__ip_vs_is_local_route6(ort)) { - IP_VS_DBG_RL("Redirect from non-local address %pI6c " - "to local requires NAT method, " - "dest: %pI6c\n", - &ipv6_hdr(skb)->daddr, daddr); - goto err_put; - } + + if (unlikely(local)) { /* skb to local stack, preserve old route */ if (!noref) dst_release(&rt->dst); @@ -428,28 +473,17 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) mtu = dst_mtu(&rt->dst); else { - struct sock *sk = skb->sk; - mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); goto err_put; } - ort = (struct rt6_info *) skb_dst(skb); - if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) - ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); + maybe_update_pmtu(skb_af, skb, mtu); } - if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { - if (!skb->dev) - skb->dev = net->loopback_dev; - /* only send ICMP too big on first fragment */ - if (!ipvsh->fragoffs) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr); + if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu)) goto err_put; - } skb_dst_drop(skb); if (noref) { @@ -555,8 +589,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); 
rcu_read_lock(); - if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, - NULL) < 0) + if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr, + IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) goto tx_error; ip_send_check(iph); @@ -585,7 +619,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL, + if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) goto tx_error; @@ -632,10 +666,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } was_input = rt_is_input_route(skb_rtable(skb)); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_RDR, NULL); + IP_VS_RT_MODE_RDR, NULL, ipvsh); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -720,8 +754,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR); @@ -790,6 +824,81 @@ tx_error: } #endif +/* When forwarding a packet, we must ensure that we've got enough headroom + * for the encapsulation packet in the skb. 
This also gives us an + * opportunity to figure out what the payload_len, dsfield, ttl, and df + * values should be, so that we won't need to look at the old ip header + * again + */ +static struct sk_buff * +ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, + unsigned int max_headroom, __u8 *next_protocol, + __u32 *payload_len, __u8 *dsfield, __u8 *ttl, + __be16 *df) +{ + struct sk_buff *new_skb = NULL; + struct iphdr *old_iph = NULL; +#ifdef CONFIG_IP_VS_IPV6 + struct ipv6hdr *old_ipv6h = NULL; +#endif + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) + goto error; + consume_skb(skb); + skb = new_skb; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + old_ipv6h = ipv6_hdr(skb); + *next_protocol = IPPROTO_IPV6; + if (payload_len) + *payload_len = + ntohs(old_ipv6h->payload_len) + + sizeof(*old_ipv6h); + *dsfield = ipv6_get_dsfield(old_ipv6h); + *ttl = old_ipv6h->hop_limit; + if (df) + *df = 0; + } else +#endif + { + old_iph = ip_hdr(skb); + /* Copy DF, reset fragment offset and MF */ + if (df) + *df = (old_iph->frag_off & htons(IP_DF)); + *next_protocol = IPPROTO_IPIP; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + *dsfield = ipv4_get_dsfield(old_iph); + *ttl = old_iph->ttl; + if (payload_len) + *payload_len = ntohs(old_iph->tot_len); + } + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(-ENOMEM); +} + +static inline int __tun_gso_type_mask(int encaps_af, int orig_af) +{ + if (encaps_af == AF_INET) { + if (orig_af == AF_INET) + return SKB_GSO_IPIP; + + return SKB_GSO_SIT; + } + + /* GSO: we need to provide proper SKB_GSO_ value for IPv6: + * SKB_GSO_SIT/IPV6 + */ + return 0; +} /* * IP Tunneling transmitter @@ -818,9 +927,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct 
iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df; + __u8 next_protocol = 0; + __u8 dsfield = 0; + __u8 ttl = 0; + __be16 df = 0; + __be16 *dfp = NULL; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -828,11 +939,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT | - IP_VS_RT_MODE_TUNNEL, &saddr); + IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -843,30 +954,26 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rt = skb_rtable(skb); tdev = rt->dst.dev; - /* Copy DF, reset fragment offset and MF */ - df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; - /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ + dfp = sysctl_pmtu_disc(ipvs) ? 
&df : NULL; + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, NULL, &dsfield, + &ttl, dfp); + if (IS_ERR(skb)) + goto tx_error; - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET, cp->af)); + if (IS_ERR(skb)) + goto tx_error; skb->transport_header = skb->network_header; - /* fix old IP header checksum */ - ip_send_check(old_iph); - skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -878,11 +985,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; + iph->protocol = next_protocol; + iph->tos = dsfield; iph->daddr = cp->daddr.ip; iph->saddr = saddr; - iph->ttl = old_iph->ttl; + iph->ttl = ttl; ip_select_ident(skb, NULL); /* Another hack: avoid icmp_send in ip_fragment */ @@ -900,7 +1007,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; tx_error: - kfree_skb(skb); + if (!IS_ERR(skb)) + kfree_skb(skb); rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; @@ -914,7 +1022,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *old_iph = ipv6_hdr(skb); + __u8 next_protocol = 0; + __u32 payload_len = 0; + __u8 dsfield = 0; + __u8 ttl = 0; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; @@ -922,7 +1033,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, 
&cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | @@ -942,16 +1053,16 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); - if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, &payload_len, + &dsfield, &ttl, NULL); + if (IS_ERR(skb)) + goto tx_error; - if (!new_skb) - goto tx_error; - consume_skb(skb); - skb = new_skb; - old_iph = ipv6_hdr(skb); - } + skb = iptunnel_handle_offloads( + skb, false, __tun_gso_type_mask(AF_INET6, cp->af)); + if (IS_ERR(skb)) + goto tx_error; skb->transport_header = skb->network_header; @@ -964,14 +1075,13 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ iph = ipv6_hdr(skb); iph->version = 6; - iph->nexthdr = IPPROTO_IPV6; - iph->payload_len = old_iph->payload_len; - be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); - iph->priority = old_iph->priority; + iph->nexthdr = next_protocol; + iph->payload_len = htons(payload_len); memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + ipv6_change_dsfield(iph, 0, dsfield); iph->daddr = cp->daddr.in6; iph->saddr = saddr; - iph->hop_limit = old_iph->hop_limit; + iph->hop_limit = ttl; /* Another hack: avoid icmp_send in ip_fragment */ skb->ignore_df = 1; @@ -988,7 +1098,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, return NF_STOLEN; tx_error: - kfree_skb(skb); + if (!IS_ERR(skb)) + kfree_skb(skb); rcu_read_unlock(); LeaveFunction(10); return NF_STOLEN; @@ -1009,10 +1120,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | - IP_VS_RT_MODE_KNOWN_NH, NULL); + 
IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); if (local < 0) goto tx_error; if (local) { @@ -1048,8 +1159,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL); if (local < 0) @@ -1116,7 +1227,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL); + local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + NULL, iph); if (local < 0) goto tx_error; rt = skb_rtable(skb); @@ -1207,8 +1319,8 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; rcu_read_lock(); - local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, - ipvsh, 0, rt_mode); + local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, + NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; rt = (struct rt6_info *) skb_dst(skb); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1f4f954c4b47..5016a6929085 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -142,7 +142,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) static u32 __hash_bucket(u32 hash, unsigned int size) { - return ((u64)hash * size) >> 32; + return reciprocal_scale(hash, size); } static u32 hash_bucket(u32 hash, const struct net *net) @@ -352,57 +352,28 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_enable(); } -static void death_by_event(unsigned long ul_conntrack) -{ - struct nf_conn *ct = (void *)ul_conntrack; - struct 
net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { - /* bad luck, let's retry again */ - ecache->timeout.expires = jiffies + - (prandom_u32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); - return; - } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_put(ct); -} - -static void nf_ct_dying_timeout(struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - /* set a new timer to retry event delivery */ - setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); - ecache->timeout.expires = jiffies + - (prandom_u32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); -} - bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); + tstamp->stop = ktime_get_real_ns(); + + if (nf_ct_is_dying(ct)) + goto delete; - if (!nf_ct_is_dying(ct) && - unlikely(nf_conntrack_event_report(IPCT_DESTROY, ct, - portid, report) < 0)) { + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + portid, report) < 0) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); - nf_ct_dying_timeout(ct); + nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); return false; } + + nf_conntrack_ecache_work(nf_ct_net(ct)); set_bit(IPS_DYING_BIT, &ct->status); + delete: nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; @@ -1464,26 +1435,6 @@ void nf_conntrack_flush_report(struct net *net, u32 portid, int report) } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); -static void nf_ct_release_dying_list(struct net *net) -{ - struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct; - struct hlist_nulls_node *n; - int cpu; - - 
for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); - } - spin_unlock_bh(&pcpu->lock); - } -} - static int untrack_refs(void) { int cnt = 0, cpu; @@ -1548,7 +1499,6 @@ i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); - nf_ct_release_dying_list(net); if (atomic_read(&net->ct.count) != 0) busy = 1; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 1df176146567..4e78c57b818f 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -29,6 +29,90 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); +#define ECACHE_RETRY_WAIT (HZ/10) + +enum retry_state { + STATE_CONGESTED, + STATE_RESTART, + STATE_DONE, +}; + +static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) +{ + struct nf_conn *refs[16]; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + unsigned int evicted = 0; + enum retry_state ret = STATE_DONE; + + spin_lock(&pcpu->lock); + + hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (nf_ct_is_dying(ct)) + continue; + + if (nf_conntrack_event(IPCT_DESTROY, ct)) { + ret = STATE_CONGESTED; + break; + } + + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + refs[evicted] = ct; + + if (++evicted >= ARRAY_SIZE(refs)) { + ret = STATE_RESTART; + break; + } + } + + spin_unlock(&pcpu->lock); + + /* can't _put while holding lock */ + while (evicted) + nf_ct_put(refs[--evicted]); + + return ret; +} + +static void ecache_work(struct work_struct *work) +{ + struct netns_ct *ctnet = + container_of(work, struct netns_ct, ecache_dwork.work); + int 
cpu, delay = -1; + struct ct_pcpu *pcpu; + + local_bh_disable(); + + for_each_possible_cpu(cpu) { + enum retry_state ret; + + pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); + + ret = ecache_work_evict_list(pcpu); + + switch (ret) { + case STATE_CONGESTED: + delay = ECACHE_RETRY_WAIT; + goto out; + case STATE_RESTART: + delay = 0; + break; + case STATE_DONE: + break; + } + } + + out: + local_bh_enable(); + + ctnet->ecache_dwork_pending = delay > 0; + if (delay >= 0) + schedule_delayed_work(&ctnet->ecache_dwork, delay); +} + /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) @@ -157,7 +241,6 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; -static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; #ifdef CONFIG_SYSCTL static struct ctl_table event_sysctl_table[] = { @@ -168,13 +251,6 @@ static struct ctl_table event_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_events_retry_timeout", - .data = &init_net.ct.sysctl_events_retry_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, {} }; #endif /* CONFIG_SYSCTL */ @@ -196,7 +272,6 @@ static int nf_conntrack_event_init_sysctl(struct net *net) goto out; table[0].data = &net->ct.sysctl_events; - table[1].data = &net->ct.sysctl_events_retry_timeout; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -238,12 +313,13 @@ static void nf_conntrack_event_fini_sysctl(struct net *net) int nf_conntrack_ecache_pernet_init(struct net *net) { net->ct.sysctl_events = nf_ct_events; - net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); return nf_conntrack_event_init_sysctl(net); } void nf_conntrack_ecache_pernet_fini(struct net *net) 
{ + cancel_delayed_work_sync(&net->ct.ecache_dwork); nf_conntrack_event_fini_sysctl(net); } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f87e8f68ad45..91a1837acd0e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -83,7 +83,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); - return ((u64)hash * nf_ct_expect_hsize) >> 32; + + return reciprocal_scale(hash, nf_ct_expect_hsize); } struct nf_conntrack_expect * diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 300ed1eec729..1bd9ed9e62f6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -745,8 +745,7 @@ static int ctnetlink_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_put((struct nf_conn *)cb->args[1]); - if (cb->data) - kfree(cb->data); + kfree(cb->data); return 0; } @@ -1738,7 +1737,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } tstamp = nf_conn_tstamp_find(ct); if (tstamp) - tstamp->start = ktime_to_ns(ktime_get_real()); + tstamp->start = ktime_get_real_ns(); err = nf_conntrack_hash_check_insert(ct); if (err < 0) diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index d25f29377648..957c1db66652 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -14,6 +14,30 @@ static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; +static bool nf_generic_should_process(u8 proto) +{ + switch (proto) { +#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE + case IPPROTO_SCTP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE + case IPPROTO_DCCP: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE + 
case IPPROTO_GRE: + return false; +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE + case IPPROTO_UDPLITE: + return false; +#endif + default: + return true; + } +} + static inline struct nf_generic_net *generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; @@ -67,7 +91,7 @@ static int generic_packet(struct nf_conn *ct, static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { - return true; + return nf_generic_should_process(nf_ct_protonum(ct)); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index f641751dba9d..cf65a1e040dd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -101,7 +101,7 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { struct ct_iter_state *st = seq->private; - st->time_now = ktime_to_ns(ktime_get_real()); + st->time_now = ktime_get_real_ns(); rcu_read_lock(); return ct_get_idx(seq, *pos); } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 85296d4eac0e..daad6022c689 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -16,16 +16,22 @@ #define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 -static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; +static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); static struct nf_logger *__find_logger(int pf, const char *str_logger) { - struct nf_logger *t; + struct nf_logger *log; + int i; + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (loggers[pf][i] == NULL) + continue; - list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { - if (!strnicmp(str_logger, t->name, strlen(t->name))) - return t; + log = rcu_dereference_protected(loggers[pf][i], + lockdep_is_held(&nf_log_mutex)); + if (!strnicmp(str_logger, log->name, strlen(log->name))) + return log; } 
return NULL; @@ -73,17 +79,14 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger) if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(logger->list); i++) - INIT_LIST_HEAD(&logger->list[i]); - mutex_lock(&nf_log_mutex); if (pf == NFPROTO_UNSPEC) { for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) - list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + rcu_assign_pointer(loggers[i][logger->type], logger); } else { /* register at end of list to honor first register win */ - list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); + rcu_assign_pointer(loggers[pf][logger->type], logger); } mutex_unlock(&nf_log_mutex); @@ -98,7 +101,7 @@ void nf_log_unregister(struct nf_logger *logger) mutex_lock(&nf_log_mutex); for (i = 0; i < NFPROTO_NUMPROTO; i++) - list_del(&logger->list[i]); + RCU_INIT_POINTER(loggers[i][logger->type], NULL); mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unregister); @@ -129,6 +132,48 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf) } EXPORT_SYMBOL(nf_log_unbind_pf); +void nf_logger_request_module(int pf, enum nf_log_type type) +{ + if (loggers[pf][type] == NULL) + request_module("nf-logger-%u-%u", pf, type); +} +EXPORT_SYMBOL_GPL(nf_logger_request_module); + +int nf_logger_find_get(int pf, enum nf_log_type type) +{ + struct nf_logger *logger; + int ret = -ENOENT; + + logger = loggers[pf][type]; + if (logger == NULL) + request_module("nf-logger-%u-%u", pf, type); + + rcu_read_lock(); + logger = rcu_dereference(loggers[pf][type]); + if (logger == NULL) + goto out; + + if (logger && try_module_get(logger->me)) + ret = 0; +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_logger_find_get); + +void nf_logger_put(int pf, enum nf_log_type type) +{ + struct nf_logger *logger; + + BUG_ON(loggers[pf][type] == NULL); + + rcu_read_lock(); + logger = rcu_dereference(loggers[pf][type]); + module_put(logger->me); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_logger_put); + void 
nf_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -143,7 +188,11 @@ void nf_log_packet(struct net *net, const struct nf_logger *logger; rcu_read_lock(); - logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (loginfo != NULL) + logger = rcu_dereference(loggers[pf][loginfo->type]); + else + logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); @@ -154,6 +203,63 @@ void nf_log_packet(struct net *net, } EXPORT_SYMBOL(nf_log_packet); +#define S_SIZE (1024 - (sizeof(unsigned int) + 1)) + +struct nf_log_buf { + unsigned int count; + char buf[S_SIZE + 1]; +}; +static struct nf_log_buf emergency, *emergency_ptr = &emergency; + +__printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...) +{ + va_list args; + int len; + + if (likely(m->count < S_SIZE)) { + va_start(args, f); + len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args); + va_end(args); + if (likely(m->count + len < S_SIZE)) { + m->count += len; + return 0; + } + } + m->count = S_SIZE; + printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n"); + return -1; +} +EXPORT_SYMBOL_GPL(nf_log_buf_add); + +struct nf_log_buf *nf_log_buf_open(void) +{ + struct nf_log_buf *m = kmalloc(sizeof(*m), GFP_ATOMIC); + + if (unlikely(!m)) { + local_bh_disable(); + do { + m = xchg(&emergency_ptr, NULL); + } while (!m); + } + m->count = 0; + return m; +} +EXPORT_SYMBOL_GPL(nf_log_buf_open); + +void nf_log_buf_close(struct nf_log_buf *m) +{ + m->buf[m->count] = 0; + printk("%s\n", m->buf); + + if (likely(m != &emergency)) + kfree(m); + else { + emergency_ptr = m; + local_bh_enable(); + } +} +EXPORT_SYMBOL_GPL(nf_log_buf_close); + #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { @@ -188,8 +294,7 @@ static int seq_show(struct seq_file *s, void *v) { loff_t *pos = v; const struct nf_logger *logger; - struct nf_logger *t; - int ret; + int i, ret; struct net *net = 
seq_file_net(s); logger = rcu_dereference_protected(net->nf.nf_loggers[*pos], @@ -203,11 +308,16 @@ static int seq_show(struct seq_file *s, void *v) if (ret < 0) return ret; - list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { - ret = seq_printf(s, "%s", t->name); + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (loggers[*pos][i] == NULL) + continue; + + logger = rcu_dereference_protected(loggers[*pos][i], + lockdep_is_held(&nf_log_mutex)); + ret = seq_printf(s, "%s", logger->name); if (ret < 0) return ret; - if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + if (i == 0 && loggers[*pos][i + 1] != NULL) { ret = seq_printf(s, ","); if (ret < 0) return ret; @@ -389,14 +499,5 @@ static struct pernet_operations nf_log_net_ops = { int __init netfilter_log_init(void) { - int i, ret; - - ret = register_pernet_subsys(&nf_log_net_ops); - if (ret < 0) - return ret; - - for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) - INIT_LIST_HEAD(&(nf_loggers_l[i])); - - return 0; + return register_pernet_subsys(&nf_log_net_ops); } diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c new file mode 100644 index 000000000000..a2233e77cf39 --- /dev/null +++ b/net/netfilter/nf_log_common.c @@ -0,0 +1,187 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + nf_log_buf_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + nf_log_buf_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); + +out: + return 0; +} +EXPORT_SYMBOL_GPL(nf_log_dump_udp_header); + +int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + nf_log_buf_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & XT_LOG_TCPSEQ) { + nf_log_buf_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + } + + /* Max 
length: 13 "WINDOW=65535 " */ + nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + nf_log_buf_add(m, "CWR "); + if (th->ece) + nf_log_buf_add(m, "ECE "); + if (th->urg) + nf_log_buf_add(m, "URG "); + if (th->ack) + nf_log_buf_add(m, "ACK "); + if (th->psh) + nf_log_buf_add(m, "PSH "); + if (th->rst) + nf_log_buf_add(m, "RST "); + if (th->syn) + nf_log_buf_add(m, "SYN "); + if (th->fin) + nf_log_buf_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + nf_log_buf_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + + nf_log_buf_add(m, ") "); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); + +void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + nf_log_buf_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} +EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid); + +void +nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + 
const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix) +{ + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif +} +EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); + +static int __init nf_log_common_init(void) +{ + return 0; +} + +static void __exit nf_log_common_exit(void) {} + +module_init(nf_log_common_init); +module_exit(nf_log_common_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index a49907b1dabc..4e0b47831d43 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -126,7 +126,8 @@ hash_by_src(const struct net *net, u16 zone, /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); - return ((u64)hash * net->ct.nat_htable_size) >> 32; + + return reciprocal_scale(hash, net->ct.nat_htable_size); } /* Is this tuple already taken? 
(not by us) */ @@ -274,7 +275,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, } var_ipp->all[i] = (__force __u32) - htonl(minip + (((u64)j * dist) >> 32)); + htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; @@ -710,7 +711,7 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .flags = NF_CT_EXT_F_PREALLOC, }; -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 83a72a235cae..fbce552a796e 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -95,7 +95,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, } EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index c8be2cdac0bf..b8067b53ff3a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -78,7 +78,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = { .manip_pkt = dccp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = dccp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index 754536f2c674..cbc7ade1487b 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -59,7 +59,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = { .manip_pkt = 
sctp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = sctp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 83ec8a6e4c36..37f5505f4529 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -79,7 +79,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_tcp = { .manip_pkt = tcp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = tcp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index 7df613fb34a2..b0ede2f0d8bc 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -70,7 +70,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_udp = { .manip_pkt = udp_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = udp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c index 776a0d1317b1..368f14e01e75 100644 --- a/net/netfilter/nf_nat_proto_udplite.c +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -69,7 +69,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = { .manip_pkt = udplite_manip_pkt, .in_range = nf_nat_l4proto_in_range, .unique_tuple = udplite_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif }; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 
5d24b1fdb593..4c8b68e5fa16 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -52,7 +52,7 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(entry->indev); if (entry->outdev) dev_put(entry->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; @@ -77,7 +77,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(entry->indev); if (entry->outdev) dev_hold(entry->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; struct net_device *physdev; diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index f042ae521557..c68c1e58b362 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -26,9 +26,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg) struct nf_sockopt_ops *ops; int ret = 0; - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - + mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == reg->pf && (overlap(ops->set_optmin, ops->set_optmax, @@ -65,9 +63,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, { struct nf_sockopt_ops *ops; - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&nf_sockopt_mutex); list_for_each_entry(ops, &nf_sockopts, list) { if (ops->pf == pf) { if (!try_module_get(ops->owner)) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8746ff9a8357..556a0dfa4abc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -127,6 +127,204 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } +static void nf_tables_unregister_hooks(const struct nft_table *table, + const struct nft_chain *chain, + unsigned int hook_nops) 
+{ + if (!(table->flags & NFT_TABLE_F_DORMANT) && + chain->flags & NFT_BASE_CHAIN) + nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops); +} + +/* Internal table flags */ +#define NFT_TABLE_INACTIVE (1 << 15) + +static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWTABLE) + ctx->table->flags |= NFT_TABLE_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nft_deltable(struct nft_ctx *ctx) +{ + int err; + + err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE); + if (err < 0) + return err; + + list_del_rcu(&ctx->table->list); + return err; +} + +static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWCHAIN) + ctx->chain->flags |= NFT_CHAIN_INACTIVE; + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + return 0; +} + +static int nft_delchain(struct nft_ctx *ctx) +{ + int err; + + err = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); + if (err < 0) + return err; + + ctx->table->use--; + list_del_rcu(&ctx->chain->list); + + return err; +} + +static inline bool +nft_rule_is_active(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << net->nft.gencursor)) == 0; +} + +static inline int gencursor_next(struct net *net) +{ + return net->nft.gencursor+1 == 1 ? 
1 : 0; +} + +static inline int +nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) +{ + return (rule->genmask & (1 << gencursor_next(net))) == 0; +} + +static inline void +nft_rule_activate_next(struct net *net, struct nft_rule *rule) +{ + /* Now inactive, will be active in the future */ + rule->genmask = (1 << net->nft.gencursor); +} + +static inline void +nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) +{ + rule->genmask = (1 << gencursor_next(net)); +} + +static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) +{ + rule->genmask = 0; +} + +static int +nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) +{ + /* You cannot delete the same rule twice */ + if (nft_rule_is_active_next(ctx->net, rule)) { + nft_rule_deactivate_next(ctx->net, rule); + ctx->chain->use--; + return 0; + } + return -ENOENT; +} + +static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, + struct nft_rule *rule) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); + if (trans == NULL) + return NULL; + + nft_trans_rule(trans) = rule; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return trans; +} + +static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule) +{ + struct nft_trans *trans; + int err; + + trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule); + if (trans == NULL) + return -ENOMEM; + + err = nf_tables_delrule_deactivate(ctx, rule); + if (err < 0) { + nft_trans_destroy(trans); + return err; + } + + return 0; +} + +static int nft_delrule_by_chain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &ctx->chain->rules, list) { + err = nft_delrule(ctx, rule); + if (err < 0) + return err; + } + return 0; +} + +/* Internal set flag */ +#define NFT_SET_INACTIVE (1 << 15) + +static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, + struct nft_set *set) +{ + struct nft_trans 
*trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { + nft_trans_set_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); + set->flags |= NFT_SET_INACTIVE; + } + nft_trans_set(trans) = set; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) +{ + int err; + + err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set); + if (err < 0) + return err; + + list_del_rcu(&set->list); + ctx->table->use--; + + return err; +} + /* * Tables */ @@ -207,9 +405,9 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; -static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table) +static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -222,7 +420,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || @@ -250,8 +448,8 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -290,7 +488,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx 
> s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_table_info(skb, + if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, @@ -309,9 +507,6 @@ done: return skb->len; } -/* Internal table flags */ -#define NFT_TABLE_INACTIVE (1 << 15) - static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -345,7 +540,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) @@ -443,21 +638,6 @@ err: return ret; } -static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWTABLE) - ctx->table->flags |= NFT_TABLE_INACTIVE; - - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return 0; -} - static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -527,6 +707,67 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, return 0; } +static int nft_flush_table(struct nft_ctx *ctx) +{ + int err; + struct nft_chain *chain, *nc; + struct nft_set *set, *ns; + + list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { + ctx->chain = chain; + + err = nft_delrule_by_chain(ctx); + if (err < 0) + goto out; + + err = nft_delchain(ctx); + if (err < 0) + goto out; + } + + list_for_each_entry_safe(set, ns, &ctx->table->sets, list) { + if (set->flags & NFT_SET_ANONYMOUS && + !list_empty(&set->bindings)) + continue; + + err = nft_delset(ctx, set); + if (err < 0) + goto out; + } + + err = 
nft_deltable(ctx); +out: + return err; +} + +static int nft_flush(struct nft_ctx *ctx, int family) +{ + struct nft_af_info *afi; + struct nft_table *table, *nt; + const struct nlattr * const *nla = ctx->nla; + int err = 0; + + list_for_each_entry(afi, &ctx->net->nft.af_info, list) { + if (family != AF_UNSPEC && afi->family != family) + continue; + + ctx->afi = afi; + list_for_each_entry_safe(table, nt, &afi->tables, list) { + if (nla[NFTA_TABLE_NAME] && + nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0) + continue; + + ctx->table = table; + + err = nft_flush_table(ctx); + if (err < 0) + goto out; + } + } +out: + return err; +} + static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -535,9 +776,13 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, struct nft_af_info *afi; struct nft_table *table; struct net *net = sock_net(skb->sk); - int family = nfmsg->nfgen_family, err; + int family = nfmsg->nfgen_family; struct nft_ctx ctx; + nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) + return nft_flush(&ctx, family); + afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) return PTR_ERR(afi); @@ -547,16 +792,11 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(table); if (table->flags & NFT_TABLE_INACTIVE) return -ENOENT; - if (table->use > 0) - return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); - err = nft_trans_table_add(&ctx, NFT_MSG_DELTABLE); - if (err < 0) - return err; + ctx.afi = afi; + ctx.table = table; - list_del_rcu(&table->list); - return 0; + return nft_flush_table(&ctx); } static void nf_tables_table_destroy(struct nft_ctx *ctx) @@ -674,9 +914,9 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table 
*table, +static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, const struct nft_chain *chain) { struct nlmsghdr *nlh; @@ -690,7 +930,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; @@ -748,8 +988,8 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -791,7 +1031,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, @@ -850,7 +1091,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) @@ -899,6 +1140,9 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) static void nft_chain_stats_replace(struct nft_base_chain *chain, struct nft_stats __percpu *newstats) { + if (newstats == NULL) + return; + if (chain->stats) { struct nft_stats __percpu *oldstats = nft_dereference(chain->stats); @@ -910,21 +1154,6 @@ static void 
nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); } -static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWCHAIN) - ctx->chain->flags |= NFT_CHAIN_INACTIVE; - - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return 0; -} - static void nf_tables_chain_destroy(struct nft_chain *chain) { BUG_ON(chain->use > 0); @@ -1154,11 +1383,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, list_add_tail_rcu(&chain->list, &table->chains); return 0; err2: - if (!(table->flags & NFT_TABLE_F_DORMANT) && - chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(chain)->ops, - afi->nops); - } + nf_tables_unregister_hooks(table, chain, afi->nops); err1: nf_tables_chain_destroy(chain); return err; @@ -1175,7 +1400,6 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - int err; afi = nf_tables_afinfo_lookup(net, family, false); if (IS_ERR(afi)) @@ -1196,13 +1420,8 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, return -EBUSY; nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); - err = nft_trans_chain_add(&ctx, NFT_MSG_DELCHAIN); - if (err < 0) - return err; - table->use--; - list_del_rcu(&chain->list); - return 0; + return nft_delchain(&ctx); } /* @@ -1429,8 +1648,9 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, }; -static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, +static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, const struct nft_table *table, const struct nft_chain 
*chain, const struct nft_rule *rule) @@ -1450,7 +1670,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; @@ -1506,8 +1726,8 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -1524,41 +1744,6 @@ err: return err; } -static inline bool -nft_rule_is_active(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & (1 << net->nft.gencursor)) == 0; -} - -static inline int gencursor_next(struct net *net) -{ - return net->nft.gencursor+1 == 1 ? 
1 : 0; -} - -static inline int -nft_rule_is_active_next(struct net *net, const struct nft_rule *rule) -{ - return (rule->genmask & (1 << gencursor_next(net))) == 0; -} - -static inline void -nft_rule_activate_next(struct net *net, struct nft_rule *rule) -{ - /* Now inactive, will be active in the future */ - rule->genmask = (1 << net->nft.gencursor); -} - -static inline void -nft_rule_disactivate_next(struct net *net, struct nft_rule *rule) -{ - rule->genmask = (1 << gencursor_next(net)); -} - -static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) -{ - rule->genmask = 0; -} - static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { @@ -1588,7 +1773,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, @@ -1654,7 +1839,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) @@ -1684,21 +1869,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, - struct nft_rule *rule) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule)); - if (trans == NULL) - return NULL; - - nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - return trans; -} - #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -1820,7 +1990,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err = 
-ENOMEM; goto err2; } - nft_rule_disactivate_next(net, old_rule); + nft_rule_deactivate_next(net, old_rule); chain->use--; list_add_tail_rcu(&rule->list, &old_rule->list); } else { @@ -1864,33 +2034,6 @@ err1: return err; } -static int -nf_tables_delrule_one(struct nft_ctx *ctx, struct nft_rule *rule) -{ - /* You cannot delete the same rule twice */ - if (nft_rule_is_active_next(ctx->net, rule)) { - if (nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule) == NULL) - return -ENOMEM; - nft_rule_disactivate_next(ctx->net, rule); - ctx->chain->use--; - return 0; - } - return -ENOENT; -} - -static int nf_table_delrule_by_chain(struct nft_ctx *ctx) -{ - struct nft_rule *rule; - int err; - - list_for_each_entry(rule, &ctx->chain->rules, list) { - err = nf_tables_delrule_one(ctx, rule); - if (err < 0) - return err; - } - return 0; -} - static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -1929,14 +2072,14 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(rule)) return PTR_ERR(rule); - err = nf_tables_delrule_one(&ctx, rule); + err = nft_delrule(&ctx, rule); } else { - err = nf_table_delrule_by_chain(&ctx); + err = nft_delrule_by_chain(&ctx); } } else { list_for_each_entry(chain, &table->chains, list) { ctx.chain = chain; - err = nf_table_delrule_by_chain(&ctx); + err = nft_delrule_by_chain(&ctx); if (err < 0) break; } @@ -2180,7 +2323,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -2201,6 +2344,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (set->policy != NFT_SET_POL_PERFORMANCE) { + if (nla_put_be32(skb, 
NFTA_SET_POLICY, htonl(set->policy))) + goto nla_put_failure; + } + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2247,80 +2395,7 @@ err: return err; } -static int nf_tables_dump_sets_table(struct nft_ctx *ctx, struct sk_buff *skb, - struct netlink_callback *cb) -{ - const struct nft_set *set; - unsigned int idx = 0, s_idx = cb->args[0]; - - if (cb->args[1]) - return skb->len; - - rcu_read_lock(); - cb->seq = ctx->net->nft.base_seq; - - list_for_each_entry_rcu(set, &ctx->table->sets, list) { - if (idx < s_idx) - goto cont; - if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, - NLM_F_MULTI) < 0) { - cb->args[0] = idx; - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - cb->args[1] = 1; -done: - rcu_read_unlock(); - return skb->len; -} - -static int nf_tables_dump_sets_family(struct nft_ctx *ctx, struct sk_buff *skb, - struct netlink_callback *cb) -{ - const struct nft_set *set; - unsigned int idx, s_idx = cb->args[0]; - struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; - - if (cb->args[1]) - return skb->len; - - rcu_read_lock(); - cb->seq = ctx->net->nft.base_seq; - - list_for_each_entry_rcu(table, &ctx->afi->tables, list) { - if (cur_table) { - if (cur_table != table) - continue; - - cur_table = NULL; - } - ctx->table = table; - idx = 0; - list_for_each_entry_rcu(set, &ctx->table->sets, list) { - if (idx < s_idx) - goto cont; - if (nf_tables_fill_set(skb, ctx, set, NFT_MSG_NEWSET, - NLM_F_MULTI) < 0) { - cb->args[0] = idx; - cb->args[2] = (unsigned long) table; - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - } - cb->args[1] = 1; -done: - rcu_read_unlock(); - return skb->len; -} - -static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, - struct netlink_callback *cb) +static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) { const struct nft_set *set; unsigned int idx, s_idx = 
cb->args[0]; @@ -2328,6 +2403,7 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); int cur_family = cb->args[3]; + struct nft_ctx *ctx = cb->data, ctx_set; if (cb->args[1]) return skb->len; @@ -2336,28 +2412,34 @@ static int nf_tables_dump_sets_all(struct nft_ctx *ctx, struct sk_buff *skb, cb->seq = net->nft.base_seq; list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (ctx->afi && ctx->afi != afi) + continue; + if (cur_family) { if (afi->family != cur_family) continue; cur_family = 0; } - list_for_each_entry_rcu(table, &afi->tables, list) { + if (ctx->table && ctx->table != table) + continue; + if (cur_table) { if (cur_table != table) continue; cur_table = NULL; } - - ctx->table = table; - ctx->afi = afi; idx = 0; - list_for_each_entry_rcu(set, &ctx->table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list) { if (idx < s_idx) goto cont; - if (nf_tables_fill_set(skb, ctx, set, + + ctx_set = *ctx; + ctx_set.table = table; + ctx_set.afi = afi; + if (nf_tables_fill_set(skb, &ctx_set, set, NFT_MSG_NEWSET, NLM_F_MULTI) < 0) { cb->args[0] = idx; @@ -2379,35 +2461,12 @@ done: return skb->len; } -static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) +static int nf_tables_dump_sets_done(struct netlink_callback *cb) { - const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - struct nlattr *nla[NFTA_SET_MAX + 1]; - struct nft_ctx ctx; - int err, ret; - - err = nlmsg_parse(cb->nlh, sizeof(*nfmsg), nla, NFTA_SET_MAX, - nft_set_policy); - if (err < 0) - return err; - - err = nft_ctx_init_from_setattr(&ctx, cb->skb, cb->nlh, (void *)nla); - if (err < 0) - return err; - - if (ctx.table == NULL) { - if (ctx.afi == NULL) - ret = nf_tables_dump_sets_all(&ctx, skb, cb); - else - ret = nf_tables_dump_sets_family(&ctx, skb, cb); - } else - ret = nf_tables_dump_sets_table(&ctx, skb, cb); - - return ret; + 
kfree(cb->data); + return 0; } -#define NFT_SET_INACTIVE (1 << 15) /* Internal set flag */ - static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2426,7 +2485,17 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_sets, + .done = nf_tables_dump_sets_done, }; + struct nft_ctx *ctx_dump; + + ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL); + if (ctx_dump == NULL) + return -ENOMEM; + + *ctx_dump = ctx; + c.data = ctx_dump; + return netlink_dump_start(nlsk, skb, nlh, &c); } @@ -2472,26 +2541,6 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, - struct nft_set *set) -{ - struct nft_trans *trans; - - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set)); - if (trans == NULL) - return -ENOMEM; - - if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) { - nft_trans_set_id(trans) = - ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID])); - set->flags |= NFT_SET_INACTIVE; - } - nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - return 0; -} - static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2625,6 +2674,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; + set->policy = policy; err = ops->init(set, &desc, nla); if (err < 0) @@ -2685,13 +2735,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (!list_empty(&set->bindings)) return -EBUSY; - err = nft_trans_set_add(&ctx, NFT_MSG_DELSET, set); - if (err < 0) - return err; - - list_del_rcu(&set->list); - ctx.table->use--; - return 0; + return nft_delset(&ctx, set); } static int 
nf_tables_bind_check_setelem(const struct nft_ctx *ctx, @@ -2889,7 +2933,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) goto nla_put_failure; @@ -2970,7 +3014,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -3150,6 +3194,9 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int rem, err = 0; + if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) + return -EINVAL; + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); if (err < 0) return err; @@ -3208,16 +3255,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, goto err2; trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); - if (trans == NULL) + if (trans == NULL) { + err = -ENOMEM; goto err2; + } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); - - nft_data_uninit(&elem.key, NFT_DATA_VALUE); - if (set->flags & NFT_SET_MAP) - nft_data_uninit(&elem.data, set->dtype); - + return 0; err2: nft_data_uninit(&elem.key, desc.type); err1: @@ -3233,6 +3278,9 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, struct nft_ctx ctx; int rem, err = 0; + if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) + return -EINVAL; + err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); if (err < 0) return err; @@ -3253,6 +3301,87 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, return err; } +static int nf_tables_fill_gen_info(struct 
sk_buff *skb, struct net *net, + u32 portid, u32 seq) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -EMSGSIZE; +} + +static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + int err; + + if (nlmsg_report(nlh) && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + goto err; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) { + kfree_skb(skb2); + goto err; + } + + err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, + NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int err; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call_batch = nf_tables_newtable, @@ 
-3329,6 +3458,9 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETGEN] = { + .call = nf_tables_getgen, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -3380,7 +3512,7 @@ static int nf_tables_commit(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; - struct nft_set *set; + struct nft_trans_elem *te; /* Bump generation counter, invalidate any dump in progress */ while (++net->nft.base_seq == 0); @@ -3422,11 +3554,9 @@ static int nf_tables_commit(struct sk_buff *skb) break; case NFT_MSG_DELCHAIN: nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN); - if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && - trans->ctx.chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, - trans->ctx.afi->nops); - } + nf_tables_unregister_hooks(trans->ctx.table, + trans->ctx.chain, + trans->ctx.afi->nops); break; case NFT_MSG_NEWRULE: nft_rule_clear(trans->ctx.net, nft_trans_rule(trans)); @@ -3466,13 +3596,17 @@ static int nf_tables_commit(struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: - nf_tables_setelem_notify(&trans->ctx, - nft_trans_elem_set(trans), - &nft_trans_elem(trans), + te = (struct nft_trans_elem *)trans->data; + nf_tables_setelem_notify(&trans->ctx, te->set, + &te->elem, NFT_MSG_DELSETELEM, 0); - set = nft_trans_elem_set(trans); - set->ops->get(set, &nft_trans_elem(trans)); - set->ops->remove(set, &nft_trans_elem(trans)); + te->set->ops->get(te->set, &te->elem); + te->set->ops->remove(te->set, &te->elem); + nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); + if (te->elem.flags & NFT_SET_MAP) { + nft_data_uninit(&te->elem.data, + te->set->dtype); + } nft_trans_destroy(trans); break; } @@ -3484,6 +3618,8 @@ static int nf_tables_commit(struct sk_buff *skb) call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); } + 
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + return 0; } @@ -3545,11 +3681,9 @@ static int nf_tables_abort(struct sk_buff *skb) } else { trans->ctx.table->use--; list_del_rcu(&trans->ctx.chain->list); - if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT) && - trans->ctx.chain->flags & NFT_BASE_CHAIN) { - nf_unregister_hooks(nft_base_chain(trans->ctx.chain)->ops, - trans->ctx.afi->nops); - } + nf_tables_unregister_hooks(trans->ctx.table, + trans->ctx.chain, + trans->ctx.afi->nops); } break; case NFT_MSG_DELCHAIN: @@ -4029,6 +4163,7 @@ static void __exit nf_tables_module_exit(void) { unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); + rcu_barrier(); nf_tables_core_module_exit(); kfree(info); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c138b8fbe280..6c5a915cfa75 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -222,6 +222,51 @@ replay: } } +struct nfnl_err { + struct list_head head; + struct nlmsghdr *nlh; + int err; +}; + +static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err) +{ + struct nfnl_err *nfnl_err; + + nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL); + if (nfnl_err == NULL) + return -ENOMEM; + + nfnl_err->nlh = nlh; + nfnl_err->err = err; + list_add_tail(&nfnl_err->head, list); + + return 0; +} + +static void nfnl_err_del(struct nfnl_err *nfnl_err) +{ + list_del(&nfnl_err->head); + kfree(nfnl_err); +} + +static void nfnl_err_reset(struct list_head *err_list) +{ + struct nfnl_err *nfnl_err, *next; + + list_for_each_entry_safe(nfnl_err, next, err_list, head) + nfnl_err_del(nfnl_err); +} + +static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) +{ + struct nfnl_err *nfnl_err, *next; + + list_for_each_entry_safe(nfnl_err, next, err_list, head) { + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err); + nfnl_err_del(nfnl_err); + } +} + static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, 
u_int16_t subsys_id) { @@ -230,6 +275,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; bool success = true, done = false; + static LIST_HEAD(err_list); int err; if (subsys_id >= NFNL_SUBSYS_COUNT) @@ -287,6 +333,7 @@ replay: type = nlh->nlmsg_type; if (type == NFNL_MSG_BATCH_BEGIN) { /* Malformed: Batch begin twice */ + nfnl_err_reset(&err_list); success = false; goto done; } else if (type == NFNL_MSG_BATCH_END) { @@ -333,7 +380,8 @@ replay: * original skb. */ if (err == -EAGAIN) { - ss->abort(skb); + nfnl_err_reset(&err_list); + ss->abort(oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); goto replay; @@ -341,11 +389,24 @@ replay: } ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) { + /* Errors are delivered once the full batch has been + * processed, this avoids that the same error is + * reported several times when replaying the batch. + */ + if (nfnl_err_add(&err_list, nlh, err) < 0) { + /* We failed to enqueue an error, reset the + * list of errors and send OOM to userspace + * pointing to the batch header. + */ + nfnl_err_reset(&err_list); + netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + success = false; + goto done; + } /* We don't stop processing the batch on errors, thus, * userspace gets all the errors that the batch * triggers. 
*/ - netlink_ack(skb, nlh, err); if (err) success = false; } @@ -357,10 +418,11 @@ ack: } done: if (success && done) - ss->commit(skb); + ss->commit(oskb); else - ss->abort(skb); + ss->abort(oskb); + nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); kfree_skb(nskb); } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 2baa125c2e8d..c18af2f63eef 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -40,7 +40,13 @@ struct nf_acct { char data[0]; }; +struct nfacct_filter { + u32 value; + u32 mask; +}; + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) +#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ static int nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, @@ -77,7 +83,8 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, smp_mb__before_atomic(); /* reset overquota flag if quota is enabled. */ if ((matching->flags & NFACCT_F_QUOTA)) - clear_bit(NFACCT_F_OVERQUOTA, &matching->flags); + clear_bit(NFACCT_OVERQUOTA_BIT, + &matching->flags); return 0; } return -EBUSY; @@ -129,6 +136,7 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; u64 pkts, bytes; + u32 old_flags; event |= NFNL_SUBSYS_ACCT << 8; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -143,12 +151,13 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; + old_flags = acct->flags; if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); smp_mb__before_atomic(); if (acct->flags & NFACCT_F_QUOTA) - clear_bit(NFACCT_F_OVERQUOTA, &acct->flags); + clear_bit(NFACCT_OVERQUOTA_BIT, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); @@ -160,7 +169,7 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; - if (nla_put_be32(skb, NFACCT_FLAGS, htonl(acct->flags)) || + if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) || nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) goto nla_put_failure; } @@ -177,6 +186,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_acct *cur, *last; + const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; @@ -193,6 +203,10 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) last = NULL; } + + if (filter && (cur->flags & filter->mask) != filter->value) + continue; + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), @@ -207,6 +221,38 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nfnl_acct_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { + [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, + [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, +}; + +static struct nfacct_filter * +nfacct_filter_alloc(const struct nlattr * const 
attr) +{ + struct nfacct_filter *filter; + struct nlattr *tb[NFACCT_FILTER_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy); + if (err < 0) + return ERR_PTR(err); + + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); + filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); + + return filter; +} + static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) @@ -218,7 +264,18 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, + .done = nfnl_acct_done, }; + + if (tb[NFACCT_FILTER]) { + struct nfacct_filter *filter; + + filter = nfacct_filter_alloc(tb[NFACCT_FILTER]); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + c.data = filter; + } return netlink_dump_start(nfnl, skb, nlh, &c); } @@ -310,6 +367,7 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, + [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { @@ -412,7 +470,7 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) ret = now > *quota; if (now >= *quota && - !test_and_set_bit(NFACCT_F_OVERQUOTA, &nfacct->flags)) { + !test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) { nfnl_overquota_report(nfacct); } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d292c8d286eb..b1e3a0579416 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -36,7 +36,7 @@ #include <linux/atomic.h> -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif @@ -429,7 +429,7 @@ 
__build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; if (indev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; @@ -460,7 +460,7 @@ __build_packet_message(struct nfnl_log_net *log, } if (outdev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; @@ -640,7 +640,7 @@ nfulnl_log_packet(struct net *net, + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif @@ -773,6 +773,7 @@ nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", + .type = NF_LOG_TYPE_ULOG, .logfn = &nfulnl_log_packet, .me = THIS_MODULE, }; @@ -1105,6 +1106,9 @@ MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); +MODULE_ALIAS_NF_LOGGER(AF_INET, 1); +MODULE_ALIAS_NF_LOGGER(AF_INET6, 1); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1); module_init(nfnetlink_log_init); module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 108120f216b1..a82077d9f59b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -36,7 +36,7 @@ #include <linux/atomic.h> -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif @@ -302,7 +302,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, + 
nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif @@ -380,7 +380,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, indev = entry->indev; if (indev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else @@ -410,7 +410,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if (outdev) { -#ifndef CONFIG_BRIDGE_NETFILTER +#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else @@ -569,7 +569,7 @@ nf_queue_entry_dup(struct nf_queue_entry *e) return NULL; } -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. 
@@ -763,7 +763,7 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (entry->outdev) if (entry->outdev->ifindex == ifindex) return 1; -#ifdef CONFIG_BRIDGE_NETFILTER +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (entry->skb->nf_bridge) { if (entry->skb->nf_bridge->physindev && entry->skb->nf_bridge->physindev->ifindex == ifindex) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1840989092ed..7e2683c8a44a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -101,26 +101,12 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, static void target_compat_from_user(struct xt_target *t, void *in, void *out) { -#ifdef CONFIG_COMPAT - if (t->compat_from_user) { - int pad; - - t->compat_from_user(out, in); - pad = XT_ALIGN(t->targetsize) - t->targetsize; - if (pad > 0) - memset(out + t->targetsize, 0, pad); - } else -#endif - memcpy(out, in, XT_ALIGN(t->targetsize)); -} + int pad; -static inline int nft_compat_target_offset(struct xt_target *target) -{ -#ifdef CONFIG_COMPAT - return xt_compat_target_offset(target); -#else - return 0; -#endif + memcpy(out, in, t->targetsize); + pad = XT_ALIGN(t->targetsize) - t->targetsize; + if (pad > 0) + memset(out + t->targetsize, 0, pad); } static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = { @@ -208,34 +194,6 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) module_put(target->me); } -static int -target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in) -{ - int ret; - -#ifdef CONFIG_COMPAT - if (t->compat_to_user) { - mm_segment_t old_fs; - void *out; - - out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC); - if (out == NULL) - return -ENOMEM; - - /* We want to reuse existing compat_to_user */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - t->compat_to_user(out, in); - set_fs(old_fs); - ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out); - kfree(out); - } else -#endif - ret = 
nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in); - - return ret; -} - static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct xt_target *target = expr->ops->data; @@ -243,7 +201,7 @@ static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) || nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) || - target_dump_info(skb, target, info)) + nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info)) goto nla_put_failure; return 0; @@ -341,17 +299,12 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, static void match_compat_from_user(struct xt_match *m, void *in, void *out) { -#ifdef CONFIG_COMPAT - if (m->compat_from_user) { - int pad; - - m->compat_from_user(out, in); - pad = XT_ALIGN(m->matchsize) - m->matchsize; - if (pad > 0) - memset(out + m->matchsize, 0, pad); - } else -#endif - memcpy(out, in, XT_ALIGN(m->matchsize)); + int pad; + + memcpy(out, in, m->matchsize); + pad = XT_ALIGN(m->matchsize) - m->matchsize; + if (pad > 0) + memset(out + m->matchsize, 0, pad); } static int @@ -404,43 +357,6 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) module_put(match->me); } -static int -match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in) -{ - int ret; - -#ifdef CONFIG_COMPAT - if (m->compat_to_user) { - mm_segment_t old_fs; - void *out; - - out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC); - if (out == NULL) - return -ENOMEM; - - /* We want to reuse existing compat_to_user */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - m->compat_to_user(out, in); - set_fs(old_fs); - ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out); - kfree(out); - } else -#endif - ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in); - - return ret; -} - -static inline int nft_compat_match_offset(struct xt_match *match) -{ -#ifdef CONFIG_COMPAT - 
return xt_compat_match_offset(match); -#else - return 0; -#endif -} - static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) { void *info = nft_expr_priv(expr); @@ -448,7 +364,7 @@ static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) || nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) || - match_dump_info(skb, match, info)) + nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info)) goto nla_put_failure; return 0; @@ -643,8 +559,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-ENOMEM); nft_match->ops.type = &nft_match_type; - nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) + - nft_compat_match_offset(match)); + nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize)); nft_match->ops.eval = nft_match_eval; nft_match->ops.init = nft_match_init; nft_match->ops.destroy = nft_match_destroy; @@ -714,8 +629,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-ENOMEM); nft_target->ops.type = &nft_target_type; - nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) + - nft_compat_target_offset(target)); + nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize)); nft_target->ops.eval = nft_target_eval; nft_target->ops.init = nft_target_init; nft_target->ops.destroy = nft_target_destroy; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 4080ed6a072b..8892b7b6184a 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -15,209 +15,40 @@ #include <linux/log2.h> #include <linux/jhash.h> #include <linux/netlink.h> -#include <linux/vmalloc.h> +#include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> -#define NFT_HASH_MIN_SIZE 4UL - -struct nft_hash { - struct nft_hash_table __rcu *tbl; -}; - -struct nft_hash_table { - unsigned int size; - struct nft_hash_elem __rcu *buckets[]; 
-}; +/* We target a hash table size of 4, element hint is 75% of final size */ +#define NFT_HASH_ELEMENT_HINT 3 struct nft_hash_elem { - struct nft_hash_elem __rcu *next; + struct rhash_head node; struct nft_data key; struct nft_data data[]; }; -#define nft_hash_for_each_entry(i, head) \ - for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next)) -#define nft_hash_for_each_entry_rcu(i, head) \ - for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next)) - -static u32 nft_hash_rnd __read_mostly; -static bool nft_hash_rnd_initted __read_mostly; - -static unsigned int nft_hash_data(const struct nft_data *data, - unsigned int hsize, unsigned int len) -{ - unsigned int h; - - h = jhash(data->data, len, nft_hash_rnd); - return h & (hsize - 1); -} - static bool nft_hash_lookup(const struct nft_set *set, const struct nft_data *key, struct nft_data *data) { - const struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_table *tbl = rcu_dereference(priv->tbl); + const struct rhashtable *priv = nft_set_priv(set); const struct nft_hash_elem *he; - unsigned int h; - - h = nft_hash_data(key, tbl->size, set->klen); - nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) { - if (nft_data_cmp(&he->key, key, set->klen)) - continue; - if (set->flags & NFT_SET_MAP) - nft_data_copy(data, he->data); - return true; - } - return false; -} - -static void nft_hash_tbl_free(const struct nft_hash_table *tbl) -{ - kvfree(tbl); -} - -static unsigned int nft_hash_tbl_size(unsigned int nelem) -{ - return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE); -} - -static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets) -{ - struct nft_hash_table *tbl; - size_t size; - - size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN); - if (tbl == NULL) - tbl = vzalloc(size); - if (tbl == NULL) - return NULL; - tbl->size = nbuckets; - - return tbl; -} - -static void 
nft_hash_chain_unzip(const struct nft_set *set, - const struct nft_hash_table *ntbl, - struct nft_hash_table *tbl, unsigned int n) -{ - struct nft_hash_elem *he, *last, *next; - unsigned int h; - - he = nft_dereference(tbl->buckets[n]); - if (he == NULL) - return; - h = nft_hash_data(&he->key, ntbl->size, set->klen); - - /* Find last element of first chain hashing to bucket h */ - last = he; - nft_hash_for_each_entry(he, he->next) { - if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) - break; - last = he; - } - /* Unlink first chain from the old table */ - RCU_INIT_POINTER(tbl->buckets[n], last->next); + he = rhashtable_lookup(priv, key); + if (he && set->flags & NFT_SET_MAP) + nft_data_copy(data, he->data); - /* If end of chain reached, done */ - if (he == NULL) - return; - - /* Find first element of second chain hashing to bucket h */ - next = NULL; - nft_hash_for_each_entry(he, he->next) { - if (nft_hash_data(&he->key, ntbl->size, set->klen) != h) - continue; - next = he; - break; - } - - /* Link the two chains */ - RCU_INIT_POINTER(last->next, next); -} - -static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv) -{ - struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; - struct nft_hash_elem *he; - unsigned int i, h; - bool complete; - - ntbl = nft_hash_tbl_alloc(tbl->size * 2); - if (ntbl == NULL) - return -ENOMEM; - - /* Link new table's buckets to first element in the old table - * hashing to the new bucket. - */ - for (i = 0; i < ntbl->size; i++) { - h = i < tbl->size ? 
i : i - tbl->size; - nft_hash_for_each_entry(he, tbl->buckets[h]) { - if (nft_hash_data(&he->key, ntbl->size, set->klen) != i) - continue; - RCU_INIT_POINTER(ntbl->buckets[i], he); - break; - } - } - - /* Publish new table */ - rcu_assign_pointer(priv->tbl, ntbl); - - /* Unzip interleaved hash chains */ - do { - /* Wait for readers to use new table/unzipped chains */ - synchronize_rcu(); - - complete = true; - for (i = 0; i < tbl->size; i++) { - nft_hash_chain_unzip(set, ntbl, tbl, i); - if (tbl->buckets[i] != NULL) - complete = false; - } - } while (!complete); - - nft_hash_tbl_free(tbl); - return 0; -} - -static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv) -{ - struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl; - struct nft_hash_elem __rcu **pprev; - unsigned int i; - - ntbl = nft_hash_tbl_alloc(tbl->size / 2); - if (ntbl == NULL) - return -ENOMEM; - - for (i = 0; i < ntbl->size; i++) { - ntbl->buckets[i] = tbl->buckets[i]; - - for (pprev = &ntbl->buckets[i]; *pprev != NULL; - pprev = &nft_dereference(*pprev)->next) - ; - RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); - } - - /* Publish new table */ - rcu_assign_pointer(priv->tbl, ntbl); - synchronize_rcu(); - - nft_hash_tbl_free(tbl); - return 0; + return !!he; } static int nft_hash_insert(const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_table *tbl = nft_dereference(priv->tbl); + struct rhashtable *priv = nft_set_priv(set); struct nft_hash_elem *he; - unsigned int size, h; + unsigned int size; if (elem->flags != 0) return -EINVAL; @@ -234,13 +65,7 @@ static int nft_hash_insert(const struct nft_set *set, if (set->flags & NFT_SET_MAP) nft_data_copy(he->data, &elem->data); - h = nft_hash_data(&he->key, tbl->size, set->klen); - RCU_INIT_POINTER(he->next, tbl->buckets[h]); - rcu_assign_pointer(tbl->buckets[h], he); - - /* Expand table when exceeding 75% load */ - if (set->nelems + 1 > 
tbl->size / 4 * 3) - nft_hash_tbl_expand(set, priv); + rhashtable_insert(priv, &he->node, GFP_KERNEL); return 0; } @@ -257,36 +82,31 @@ static void nft_hash_elem_destroy(const struct nft_set *set, static void nft_hash_remove(const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_table *tbl = nft_dereference(priv->tbl); - struct nft_hash_elem *he, __rcu **pprev; + struct rhashtable *priv = nft_set_priv(set); + struct rhash_head *he, __rcu **pprev; pprev = elem->cookie; - he = nft_dereference((*pprev)); + he = rht_dereference((*pprev), priv); + + rhashtable_remove_pprev(priv, he, pprev, GFP_KERNEL); - RCU_INIT_POINTER(*pprev, he->next); synchronize_rcu(); kfree(he); - - /* Shrink table beneath 30% load */ - if (set->nelems - 1 < tbl->size * 3 / 10 && - tbl->size > NFT_HASH_MIN_SIZE) - nft_hash_tbl_shrink(set, priv); } static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { - const struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_table *tbl = nft_dereference(priv->tbl); - struct nft_hash_elem __rcu * const *pprev; + const struct rhashtable *priv = nft_set_priv(set); + const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv); + struct rhash_head __rcu * const *pprev; struct nft_hash_elem *he; - unsigned int h; + u32 h; - h = nft_hash_data(&elem->key, tbl->size, set->klen); + h = rhashtable_hashfn(priv, &elem->key, set->klen); pprev = &tbl->buckets[h]; - nft_hash_for_each_entry(he, tbl->buckets[h]) { + rht_for_each_entry_rcu(he, tbl->buckets[h], node) { if (nft_data_cmp(&he->key, &elem->key, set->klen)) { - pprev = &he->next; + pprev = &he->node.next; continue; } @@ -302,14 +122,15 @@ static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { - const struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_table 
*tbl = nft_dereference(priv->tbl); + const struct rhashtable *priv = nft_set_priv(set); + const struct bucket_table *tbl; const struct nft_hash_elem *he; struct nft_set_elem elem; unsigned int i; + tbl = rht_dereference_rcu(priv->tbl, priv); for (i = 0; i < tbl->size; i++) { - nft_hash_for_each_entry(he, tbl->buckets[i]) { + rht_for_each_entry_rcu(he, tbl->buckets[i], node) { if (iter->count < iter->skip) goto cont; @@ -329,48 +150,48 @@ cont: static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) { - return sizeof(struct nft_hash); + return sizeof(struct rhashtable); +} + +static int lockdep_nfnl_lock_is_held(void) +{ + return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES); } static int nft_hash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_table *tbl; - unsigned int size; + struct rhashtable *priv = nft_set_priv(set); + struct rhashtable_params params = { + .nelem_hint = desc->size ? 
: NFT_HASH_ELEMENT_HINT, + .head_offset = offsetof(struct nft_hash_elem, node), + .key_offset = offsetof(struct nft_hash_elem, key), + .key_len = set->klen, + .hashfn = jhash, + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_30, + .mutex_is_held = lockdep_nfnl_lock_is_held, + }; - if (unlikely(!nft_hash_rnd_initted)) { - get_random_bytes(&nft_hash_rnd, 4); - nft_hash_rnd_initted = true; - } - - size = NFT_HASH_MIN_SIZE; - if (desc->size) - size = nft_hash_tbl_size(desc->size); - - tbl = nft_hash_tbl_alloc(size); - if (tbl == NULL) - return -ENOMEM; - RCU_INIT_POINTER(priv->tbl, tbl); - return 0; + return rhashtable_init(priv, &params); } static void nft_hash_destroy(const struct nft_set *set) { - const struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_table *tbl = nft_dereference(priv->tbl); + const struct rhashtable *priv = nft_set_priv(set); + const struct bucket_table *tbl = priv->tbl; struct nft_hash_elem *he, *next; unsigned int i; for (i = 0; i < tbl->size; i++) { - for (he = nft_dereference(tbl->buckets[i]); he != NULL; - he = next) { - next = nft_dereference(he->next); + for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node); + he != NULL; he = next) { + next = rht_entry(he->node.next, struct nft_hash_elem, node); nft_hash_elem_destroy(set, he); } } - kfree(tbl); + rhashtable_destroy(priv); } static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, @@ -383,8 +204,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]); if (desc->size) { - est->size = sizeof(struct nft_hash) + - nft_hash_tbl_size(desc->size) * + est->size = sizeof(struct rhashtable) + + roundup_pow_of_two(desc->size * 4 / 3) * sizeof(struct nft_hash_elem *) + desc->size * esize; } else { diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 10cfb156cdf4..bde05f28cf14 100644 --- a/net/netfilter/nft_log.c +++ 
b/net/netfilter/nft_log.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2014 Pablo Neira Ayuso <pablo@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -41,6 +42,8 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, + [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, + [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; static int nft_log_init(const struct nft_ctx *ctx, @@ -50,6 +53,7 @@ static int nft_log_init(const struct nft_ctx *ctx, struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; + int ret; nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -57,30 +61,74 @@ static int nft_log_init(const struct nft_ctx *ctx, if (priv->prefix == NULL) return -ENOMEM; nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); - } else + } else { priv->prefix = (char *)nft_log_null_prefix; + } - li->type = NF_LOG_TYPE_ULOG; + li->type = NF_LOG_TYPE_LOG; + if (tb[NFTA_LOG_LEVEL] != NULL && + tb[NFTA_LOG_GROUP] != NULL) + return -EINVAL; if (tb[NFTA_LOG_GROUP] != NULL) + li->type = NF_LOG_TYPE_ULOG; + + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (tb[NFTA_LOG_LEVEL] != NULL) { + li->u.log.level = + ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); + } else { + li->u.log.level = 4; + } + if (tb[NFTA_LOG_FLAGS] != NULL) { + li->u.log.logflags = + ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + } + break; + case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); + if (tb[NFTA_LOG_SNAPLEN] != NULL) { + li->u.ulog.copy_len = + ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); + } + if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { + li->u.ulog.qthreshold = + ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + } + break; + } - if (tb[NFTA_LOG_SNAPLEN] 
!= NULL) - li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); - if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { - li->u.ulog.qthreshold = - ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); + if (ctx->afi->family == NFPROTO_INET) { + ret = nf_logger_find_get(NFPROTO_IPV4, li->type); + if (ret < 0) + return ret; + + ret = nf_logger_find_get(NFPROTO_IPV6, li->type); + if (ret < 0) { + nf_logger_put(NFPROTO_IPV4, li->type); + return ret; + } + return 0; } - return 0; + return nf_logger_find_get(ctx->afi->family, li->type); } static void nft_log_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_log *priv = nft_expr_priv(expr); + struct nf_loginfo *li = &priv->loginfo; if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); + + if (ctx->afi->family == NFPROTO_INET) { + nf_logger_put(NFPROTO_IPV4, li->type); + nf_logger_put(NFPROTO_IPV6, li->type); + } else { + nf_logger_put(ctx->afi->family, li->type); + } } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -91,17 +139,33 @@ static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) if (priv->prefix != nft_log_null_prefix) if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) goto nla_put_failure; - if (li->u.ulog.group) - if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) - goto nla_put_failure; - if (li->u.ulog.copy_len) - if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, - htonl(li->u.ulog.copy_len))) + switch (li->type) { + case NF_LOG_TYPE_LOG: + if (nla_put_be32(skb, NFTA_LOG_LEVEL, htonl(li->u.log.level))) goto nla_put_failure; - if (li->u.ulog.qthreshold) - if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, - htons(li->u.ulog.qthreshold))) + + if (li->u.log.logflags) { + if (nla_put_be32(skb, NFTA_LOG_FLAGS, + htonl(li->u.log.logflags))) + goto nla_put_failure; + } + break; + case NF_LOG_TYPE_ULOG: + if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; + + if (li->u.ulog.copy_len) { + if (nla_put_be32(skb, 
NFTA_LOG_SNAPLEN, + htonl(li->u.ulog.copy_len))) + goto nla_put_failure; + } + if (li->u.ulog.qthreshold) { + if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, + htons(li->u.ulog.qthreshold))) + goto nla_put_failure; + } + break; + } return 0; nla_put_failure: diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c new file mode 100644 index 000000000000..6637bab00567 --- /dev/null +++ b/net/netfilter/nft_masq.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nft_masq.h> + +const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_masq_policy); + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_masq *priv = nft_expr_priv(expr); + + if (tb[NFTA_MASQ_FLAGS] == NULL) + return 0; + + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_masq_init); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_masq *priv = nft_expr_priv(expr); + + if (priv->flags == 0) + return 0; + + if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_masq_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); diff --git 
a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 852b178c6ae7..1e7c076ca63a 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,10 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/smp.h> #include <net/dst.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ @@ -124,6 +128,43 @@ void nft_meta_get_eval(const struct nft_expr *expr, dest->data[0] = skb->secmark; break; #endif + case NFT_META_PKTTYPE: + if (skb->pkt_type != PACKET_LOOPBACK) { + dest->data[0] = skb->pkt_type; + break; + } + + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + case NFPROTO_IPV6: + if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + default: + WARN_ON(1); + goto err; + } + break; + case NFT_META_CPU: + dest->data[0] = smp_processor_id(); + break; + case NFT_META_IIFGROUP: + if (in == NULL) + goto err; + dest->data[0] = in->group; + break; + case NFT_META_OIFGROUP: + if (out == NULL) + goto err; + dest->data[0] = out->group; + break; default: WARN_ON(1); goto err; @@ -195,6 +236,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif + case NFT_META_PKTTYPE: + case NFT_META_CPU: + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: break; default: return -EOPNOTSUPP; diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 79ff58cd36dc..799550b476fb 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -33,6 +33,7 @@ struct nft_nat { enum nft_registers sreg_proto_max:8; enum nf_nat_manip_type type:8; u8 family; + u16 flags; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -71,6 +72,8 @@ 
static void nft_nat_eval(const struct nft_expr *expr, range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } + range.flags |= priv->flags; + data[NFT_REG_VERDICT].verdict = nf_nat_setup_info(ct, &range, priv->type); } @@ -82,6 +85,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, }; static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -149,6 +153,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else priv->sreg_proto_max = priv->sreg_proto_min; + if (tb[NFTA_NAT_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + return 0; } @@ -183,6 +193,12 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) htonl(priv->sreg_proto_max))) goto nla_put_failure; } + + if (priv->flags != 0) { + if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + } + return 0; nla_put_failure: diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index e1836ff88199..46214f245665 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -234,13 +234,11 @@ static void nft_rbtree_destroy(const struct nft_set *set) struct nft_rbtree_elem *rbe; struct rb_node *node; - spin_lock_bh(&nft_rbtree_lock); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nft_rbtree_elem_destroy(set, rbe); } - spin_unlock_bh(&nft_rbtree_lock); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f3448c296446..57d3e1af5630 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -17,6 +17,8 @@ #include 
<linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, @@ -70,5 +72,38 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_reject_dump); +static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, +}; + +int nft_reject_icmp_code(u8 code) +{ + BUG_ON(code > NFT_REJECT_ICMPX_MAX); + + return icmp_code_v4[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmp_code); + + +static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, +}; + +int nft_reject_icmpv6_code(u8 code) +{ + BUG_ON(code > NFT_REJECT_ICMPX_MAX); + + return icmp_code_v6[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index b718a52a4654..7b5f9d58680a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -14,17 +14,103 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> static void nft_reject_inet_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? 
pkt->in : pkt->out); + switch (pkt->ops->pf) { case NFPROTO_IPV4: - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case NFPROTO_IPV6: - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; + } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_inet_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; } + return 0; +} + +static int nft_reject_inet_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + 
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_inet_type; @@ -32,8 +118,8 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_inet_init, + .dump = nft_reject_inet_dump, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 227aa11e8409..133eb4772f12 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -71,18 +71,14 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { static const unsigned int xt_jumpstack_multiplier = 2; /* Registration hooks for targets. */ -int -xt_register_target(struct xt_target *target) +int xt_register_target(struct xt_target *target) { u_int8_t af = target->family; - int ret; - ret = mutex_lock_interruptible(&xt[af].mutex); - if (ret != 0) - return ret; + mutex_lock(&xt[af].mutex); list_add(&target->list, &xt[af].target); mutex_unlock(&xt[af].mutex); - return ret; + return 0; } EXPORT_SYMBOL(xt_register_target); @@ -125,20 +121,14 @@ xt_unregister_targets(struct xt_target *target, unsigned int n) } EXPORT_SYMBOL(xt_unregister_targets); -int -xt_register_match(struct xt_match *match) +int xt_register_match(struct xt_match *match) { u_int8_t af = match->family; - int ret; - - ret = mutex_lock_interruptible(&xt[af].mutex); - if (ret != 0) - return ret; + mutex_lock(&xt[af].mutex); list_add(&match->list, &xt[af].match); mutex_unlock(&xt[af].mutex); - - return ret; + return 0; } EXPORT_SYMBOL(xt_register_match); @@ -194,9 +184,7 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) struct xt_match *m; int err = -ENOENT; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return 
ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(m, &xt[af].match, list) { if (strcmp(m->name, name) == 0) { if (m->revision == revision) { @@ -239,9 +227,7 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) struct xt_target *t; int err = -ENOENT; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt[af].target, list) { if (strcmp(t->name, name) == 0) { if (t->revision == revision) { @@ -323,10 +309,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, { int have_rev, best = -1; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) { - *err = -EINTR; - return 1; - } + mutex_lock(&xt[af].mutex); if (target == 1) have_rev = target_revfn(af, name, revision, &best); else @@ -711,28 +694,15 @@ void xt_free_table_info(struct xt_table_info *info) { int cpu; - for_each_possible_cpu(cpu) { - if (info->size <= PAGE_SIZE) - kfree(info->entries[cpu]); - else - vfree(info->entries[cpu]); - } + for_each_possible_cpu(cpu) + kvfree(info->entries[cpu]); if (info->jumpstack != NULL) { - if (sizeof(void *) * info->stacksize > PAGE_SIZE) { - for_each_possible_cpu(cpu) - vfree(info->jumpstack[cpu]); - } else { - for_each_possible_cpu(cpu) - kfree(info->jumpstack[cpu]); - } + for_each_possible_cpu(cpu) + kvfree(info->jumpstack[cpu]); + kvfree(info->jumpstack); } - if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE) - vfree(info->jumpstack); - else - kfree(info->jumpstack); - free_percpu(info->stackptr); kfree(info); @@ -745,9 +715,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, { struct xt_table *t; - if (mutex_lock_interruptible(&xt[af].mutex) != 0) - return ERR_PTR(-EINTR); - + mutex_lock(&xt[af].mutex); list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; @@ -896,10 +864,7 @@ struct xt_table *xt_register_table(struct net *net, goto out; } - ret = 
mutex_lock_interruptible(&xt[table->af].mutex); - if (ret != 0) - goto out_free; - + mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ list_for_each_entry(t, &net->xt.tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { @@ -924,9 +889,8 @@ struct xt_table *xt_register_table(struct net *net, mutex_unlock(&xt[table->af].mutex); return table; - unlock: +unlock: mutex_unlock(&xt[table->af].mutex); -out_free: kfree(table); out: return ERR_PTR(ret); @@ -1137,22 +1101,11 @@ static const struct seq_operations xt_match_seq_ops = { static int xt_match_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_match_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_match_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } @@ -1201,22 +1154,11 @@ static const struct seq_operations xt_target_seq_ops = { static int xt_target_open(struct inode *inode, struct file *file) { - struct seq_file *seq; struct nf_mttg_trav *trav; - int ret; - - trav = kmalloc(sizeof(*trav), GFP_KERNEL); - if (trav == NULL) + trav = __seq_open_private(file, &xt_target_seq_ops, sizeof(*trav)); + if (!trav) return -ENOMEM; - ret = seq_open(file, &xt_target_seq_ops); - if (ret < 0) { - kfree(trav); - return ret; - } - - seq = file->private_data; - seq->private = trav; trav->nfproto = (unsigned long)PDE_DATA(inode); return 0; } diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 73b73f687c58..02afaf48a729 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -126,7 +126,7 @@ hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); hash = hash ^ (t->proto & 
info->proto_mask); - return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; + return reciprocal_scale(hash, info->hmodulus) + info->hoffset; } static void diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 993de2ba89d3..3ba31c194cce 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -50,11 +50,14 @@ struct xt_led_info_internal { struct timer_list timer; }; +#define XT_LED_BLINK_DELAY 50 /* ms */ + static unsigned int led_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_led_info *ledinfo = par->targinfo; struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + unsigned long led_delay = XT_LED_BLINK_DELAY; /* * If "always blink" is enabled, and there's still some time until the @@ -62,9 +65,10 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) */ if ((ledinfo->delay > 0) && ledinfo->always_blink && timer_pending(&ledinternal->timer)) - led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); - - led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger, + &led_delay, &led_delay, 1); + else + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); /* If there's a positive delay, start/update the timer */ if (ledinfo->delay > 0) { @@ -133,9 +137,7 @@ static int led_tg_check(const struct xt_tgchk_param *par) err = led_trigger_register(&ledinternal->netfilter_led_trigger); if (err) { - pr_warning("led_trigger_register() failed\n"); - if (err == -EEXIST) - pr_warning("Trigger name is already in use.\n"); + pr_err("Trigger name is already in use.\n"); goto exit_alloc; } diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 5ab24843370a..c13b79440ede 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -27,806 +27,6 @@ #include <linux/netfilter/xt_LOG.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_log.h> -#include <net/netfilter/xt_log.h> 
- -static struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = 5, - .logflags = NF_LOG_MASK, - }, - }, -}; - -static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset) -{ - struct udphdr _udph; - const struct udphdr *uh; - - if (proto == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - sb_add(m, "PROTO=UDP "); - else /* Max length: 14 "PROTO=UDPLITE " */ - sb_add(m, "PROTO=UDPLITE "); - - if (fragment) - goto out; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); - if (uh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), - ntohs(uh->len)); - -out: - return 0; -} - -static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags) -{ - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - sb_add(m, "PROTO=TCP "); - - if (fragment) - return 0; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (th == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & XT_LOG_TCPSEQ) - sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); - - /* Max length: 13 "WINDOW=65535 " */ - sb_add(m, "WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & - TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - sb_add(m, "CWR "); - if (th->ece) - 
sb_add(m, "ECE "); - if (th->urg) - sb_add(m, "URG "); - if (th->ack) - sb_add(m, "ACK "); - if (th->psh) - sb_add(m, "PSH "); - if (th->rst) - sb_add(m, "RST "); - if (th->syn) - sb_add(m, "SYN "); - if (th->fin) - sb_add(m, "FIN "); - /* Max length: 11 "URGP=65535 " */ - sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - sb_add(m, "OPT (TRUNCATED)"); - return 1; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - sb_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - sb_add(m, "%02X", op[i]); - - sb_add(m, ") "); - } - - return 0; -} - -static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) -{ - if (!sk || sk->sk_state == TCP_TIME_WAIT) - return; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; - sb_add(m, "UID=%u GID=%u ", - from_kuid_munged(&init_user_ns, cred->fsuid), - from_kgid_munged(&init_user_ns, cred->fsgid)); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -/* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, - unsigned int iphoff) -{ - struct iphdr _iph; - const struct iphdr *ih; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); - if (ih == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Important fields: - * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ - /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - sb_add(m, "SRC=%pI4 DST=%pI4 ", - &ih->saddr, &ih->daddr); - - /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, - ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); - - /* Max length: 6 "CE DF MF " */ - if (ntohs(ih->frag_off) & IP_CE) - sb_add(m, "CE "); - if (ntohs(ih->frag_off) & IP_DF) - sb_add(m, "DF "); - if (ntohs(ih->frag_off) & IP_MF) - sb_add(m, "MF "); - - /* Max length: 11 "FRAG:65535 " */ - if (ntohs(ih->frag_off) & IP_OFFSET) - sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - - if ((logflags & XT_LOG_IPOPT) && - ih->ihl * 4 > sizeof(struct iphdr)) { - const unsigned char *op; - unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; - unsigned int i, optsize; - - optsize = ih->ihl * 4 - sizeof(struct iphdr); - op = skb_header_pointer(skb, iphoff+sizeof(_iph), - optsize, _opt); - if (op == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - sb_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - sb_add(m, "%02X", op[i]); - sb_add(m, ") "); - } - - switch (ih->protocol) { - case IPPROTO_TCP: - if (dump_tcp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4, logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (dump_udp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4)) - return; - break; - case IPPROTO_ICMP: { - struct icmphdr _icmph; - const struct icmphdr *ich; - static const size_t required_len[NR_ICMP_TYPES+1] - = { [ICMP_ECHOREPLY] = 4, - [ICMP_DEST_UNREACH] - = 8 + sizeof(struct iphdr), - [ICMP_SOURCE_QUENCH] - = 8 + sizeof(struct iphdr), - [ICMP_REDIRECT] - = 8 + sizeof(struct iphdr), - [ICMP_ECHO] = 4, - [ICMP_TIME_EXCEEDED] - = 8 + sizeof(struct iphdr), - [ICMP_PARAMETERPROB] - = 8 + sizeof(struct iphdr), - 
[ICMP_TIMESTAMP] = 20, - [ICMP_TIMESTAMPREPLY] = 20, - [ICMP_ADDRESS] = 12, - [ICMP_ADDRESSREPLY] = 12 }; - - /* Max length: 11 "PROTO=ICMP " */ - sb_add(m, "PROTO=ICMP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, - sizeof(_icmph), &_icmph); - if (ich == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES && - required_len[ich->type] && - skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - switch (ich->type) { - case ICMP_ECHOREPLY: - case ICMP_ECHO: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - sb_add(m, "ID=%u SEQ=%u ", - ntohs(ich->un.echo.id), - ntohs(ich->un.echo.sequence)); - break; - - case ICMP_PARAMETERPROB: - /* Max length: 14 "PARAMETER=255 " */ - sb_add(m, "PARAMETER=%u ", - ntohl(ich->un.gateway) >> 24); - break; - case ICMP_REDIRECT: - /* Max length: 24 "GATEWAY=255.255.255.255 " */ - sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); - /* Fall through */ - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_TIME_EXCEEDED: - /* Max length: 3+maxlen */ - if (!iphoff) { /* Only recurse once. 
*/ - sb_add(m, "["); - dump_ipv4_packet(m, info, skb, - iphoff + ih->ihl*4+sizeof(_icmph)); - sb_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH && - ich->code == ICMP_FRAG_NEEDED) - sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); - } - break; - } - /* Max Length */ - case IPPROTO_AH: { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 9 "PROTO=AH " */ - sb_add(m, "PROTO=AH "); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ah = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_ahdr), &_ahdr); - if (ah == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); - break; - } - case IPPROTO_ESP: { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 10 "PROTO=ESP " */ - sb_add(m, "PROTO=ESP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - eh = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_esph), &_esph); - if (eh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); - break; - } - /* Max length: 10 "PROTO 255 " */ - default: - sb_add(m, "PROTO=%u ", ih->protocol); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff) - dump_sk_uid_gid(m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!iphoff && skb->mark) - sb_add(m, "MARK=0x%x ", skb->mark); - - /* Proto Max log string length */ - /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ - /* UDP: 10+max(25,20) = 35 */ - /* UDPLITE: 14+max(25,20) = 39 */ - /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ - /* ESP: 10+max(25)+15 = 50 */ - /* AH: 9+max(25)+15 = 49 */ - /* 
unknown: 10 */ - - /* (ICMP allows recursion one level deep) */ - /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ -} - -static void dump_ipv4_mac_header(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & XT_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - sb_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int i; - - sb_add(m, "%02x", *p++); - for (i = 1; i < dev->hard_header_len; i++, p++) - sb_add(m, ":%02x", *p); - } - sb_add(m, " "); -} - -static void -log_packet_common(struct sbuff *m, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", - '0' + loginfo->u.log.level, prefix, - in ? in->name : "", - out ? 
out->name : ""); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - const struct net_device *physindev; - const struct net_device *physoutdev; - - physindev = skb->nf_bridge->physindev; - if (physindev && in != physindev) - sb_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev && out != physoutdev) - sb_add(m, "PHYSOUT=%s ", physoutdev->name); - } -#endif -} - - -static void -ipt_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct sbuff *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) - return; - - m = sb_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); - - if (in != NULL) - dump_ipv4_mac_header(m, loginfo, skb); - - dump_ipv4_packet(m, loginfo, skb, 0); - - sb_close(m); -} - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -/* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int ip6hoff, - int recurse) -{ - u_int8_t currenthdr; - int fragment; - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - unsigned int ptr; - unsigned int hdrlen = 0; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); - if (ih == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); - - /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - 
ntohs(ih->payload_len) + sizeof(struct ipv6hdr), - (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, - ih->hop_limit, - (ntohl(*(__be32 *)ih) & 0x000fffff)); - - fragment = 0; - ptr = ip6hoff + sizeof(struct ipv6hdr); - currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { - struct ipv6_opt_hdr _hdr; - const struct ipv6_opt_hdr *hp; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - if (hp == NULL) { - sb_add(m, "TRUNCATED"); - return; - } - - /* Max length: 48 "OPT (...) " */ - if (logflags & XT_LOG_IPOPT) - sb_add(m, "OPT ( "); - - switch (currenthdr) { - case IPPROTO_FRAGMENT: { - struct frag_hdr _fhdr; - const struct frag_hdr *fh; - - sb_add(m, "FRAG:"); - fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), - &_fhdr); - if (fh == NULL) { - sb_add(m, "TRUNCATED "); - return; - } - - /* Max length: 6 "65535 " */ - sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); - - /* Max length: 11 "INCOMPLETE " */ - if (fh->frag_off & htons(0x0001)) - sb_add(m, "INCOMPLETE "); - - sb_add(m, "ID:%08x ", ntohl(fh->identification)); - - if (ntohs(fh->frag_off) & 0xFFF8) - fragment = 1; - - hdrlen = 8; - - break; - } - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - if (fragment) { - if (logflags & XT_LOG_IPOPT) - sb_add(m, ")"); - return; - } - hdrlen = ipv6_optlen(hp); - break; - /* Max Length */ - case IPPROTO_AH: - if (logflags & XT_LOG_IPOPT) { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - /* Max length: 3 "AH " */ - sb_add(m, "AH "); - - if (fragment) { - sb_add(m, ")"); - return; - } - - ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), - &_ahdr); - if (ah == NULL) { - /* - * Max length: 26 "INCOMPLETE [65535 - * bytes] )" - */ - sb_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 15 "SPI=0xF1234567 */ - sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); - - } - - hdrlen = (hp->hdrlen+2)<<2; - break; - case IPPROTO_ESP: - if (logflags & XT_LOG_IPOPT) { - struct ip_esp_hdr 
_esph; - const struct ip_esp_hdr *eh; - - /* Max length: 4 "ESP " */ - sb_add(m, "ESP "); - - if (fragment) { - sb_add(m, ")"); - return; - } - - /* - * Max length: 26 "INCOMPLETE [65535 bytes] )" - */ - eh = skb_header_pointer(skb, ptr, sizeof(_esph), - &_esph); - if (eh == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 16 "SPI=0xF1234567 )" */ - sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); - - } - return; - default: - /* Max length: 20 "Unknown Ext Hdr 255" */ - sb_add(m, "Unknown Ext Hdr %u", currenthdr); - return; - } - if (logflags & XT_LOG_IPOPT) - sb_add(m, ") "); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - } - - switch (currenthdr) { - case IPPROTO_TCP: - if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, - logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) - return; - break; - case IPPROTO_ICMPV6: { - struct icmp6hdr _icmp6h; - const struct icmp6hdr *ic; - - /* Max length: 13 "PROTO=ICMPv6 " */ - sb_add(m, "PROTO=ICMPv6 "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); - if (ic == NULL) { - sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); - - switch (ic->icmp6_type) { - case ICMPV6_ECHO_REQUEST: - case ICMPV6_ECHO_REPLY: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - sb_add(m, "ID=%u SEQ=%u ", - ntohs(ic->icmp6_identifier), - ntohs(ic->icmp6_sequence)); - break; - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - break; - - case ICMPV6_PARAMPROB: - /* Max length: 17 "POINTER=ffffffff " */ - sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); - /* Fall through */ - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - /* Max length: 3+maxlen */ 
- if (recurse) { - sb_add(m, "["); - dump_ipv6_packet(m, info, skb, - ptr + sizeof(_icmp6h), 0); - sb_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) - sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); - } - break; - } - /* Max length: 10 "PROTO=255 " */ - default: - sb_add(m, "PROTO=%u ", currenthdr); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse) - dump_sk_uid_gid(m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (recurse && skb->mark) - sb_add(m, "MARK=0x%x ", skb->mark); -} - -static void dump_ipv6_mac_header(struct sbuff *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & XT_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - sb_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int len = dev->hard_header_len; - unsigned int i; - - if (dev->type == ARPHRD_SIT) { - p -= ETH_HLEN; - - if (p < skb->head) - p = NULL; - } - - if (p != NULL) { - sb_add(m, "%02x", *p++); - for (i = 1; i < len; i++) - sb_add(m, ":%02x", *p++); - } - sb_add(m, " "); - - if (dev->type == ARPHRD_SIT) { - const struct iphdr *iph = - (struct iphdr *)skb_mac_header(skb); - sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, - &iph->daddr); - } - } else - sb_add(m, " "); -} - -static void -ip6t_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct sbuff 
*m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net)) - return; - - m = sb_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); - - if (in != NULL) - dump_ipv6_mac_header(m, loginfo, skb); - - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); - - sb_close(m); -} -#endif static unsigned int log_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -839,17 +39,8 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; - if (par->family == NFPROTO_IPV4) - ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, - par->out, &li, loginfo->prefix); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - else if (par->family == NFPROTO_IPV6) - ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, - par->out, &li, loginfo->prefix); -#endif - else - WARN_ON_ONCE(1); - + nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, + &li, "%s", loginfo->prefix); return XT_CONTINUE; } @@ -870,7 +61,12 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return 0; + return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); +} + +static void log_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_LOG); } static struct xt_target log_tg_regs[] __read_mostly = { @@ -880,6 +76,7 @@ static struct xt_target log_tg_regs[] __read_mostly = { .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, + .destroy = log_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -889,78 +86,19 @@ static struct xt_target log_tg_regs[] __read_mostly = { .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, + .destroy = log_tg_destroy, .me = THIS_MODULE, }, #endif }; -static struct nf_logger ipt_log_logger __read_mostly = 
{ - .name = "ipt_LOG", - .logfn = &ipt_log_packet, - .me = THIS_MODULE, -}; - -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) -static struct nf_logger ip6t_log_logger __read_mostly = { - .name = "ip6t_LOG", - .logfn = &ip6t_log_packet, - .me = THIS_MODULE, -}; -#endif - -static int __net_init log_net_init(struct net *net) -{ - nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger); -#endif - return 0; -} - -static void __net_exit log_net_exit(struct net *net) -{ - nf_log_unset(net, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_unset(net, &ip6t_log_logger); -#endif -} - -static struct pernet_operations log_net_ops = { - .init = log_net_init, - .exit = log_net_exit, -}; - static int __init log_tg_init(void) { - int ret; - - ret = register_pernet_subsys(&log_net_ops); - if (ret < 0) - goto err_pernet; - - ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); - if (ret < 0) - goto err_target; - - nf_log_register(NFPROTO_IPV4, &ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); -#endif - return 0; - -err_target: - unregister_pernet_subsys(&log_net_ops); -err_pernet: - return ret; + return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } static void __exit log_tg_exit(void) { - unregister_pernet_subsys(&log_net_ops); - nf_log_unregister(&ipt_log_logger); -#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) - nf_log_unregister(&ip6t_log_logger); -#endif xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 370adf622cef..604df6fae6fc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -136,7 +136,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; - ret = gen_new_estimator(&est->bstats, &est->rstats, + ret = 
gen_new_estimator(&est->bstats, NULL, &est->rstats, &est->lock, &cfg.opt); if (ret < 0) goto err2; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index bbffdbdaf603..dffee9d47ec4 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -28,7 +28,7 @@ static int bpf_mt_check(const struct xt_mtchk_param *par) program.len = info->bpf_program_num_elem; program.filter = info->bpf_program; - if (sk_unattached_filter_create(&info->filter, &program)) { + if (bpf_prog_create(&info->filter, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; } @@ -40,13 +40,13 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return SK_RUN_FILTER(info->filter, skb); + return BPF_PROG_RUN(info->filter, skb); } static void bpf_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_bpf_info *info = par->matchinfo; - sk_unattached_filter_destroy(info->filter); + bpf_prog_destroy(info->filter); } static struct xt_match bpf_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index f4e833005320..7198d660b4de 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -31,7 +31,7 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; - return info->id ? 
0 : -EINVAL; + return 0; } static bool diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index f4af1bfafb1c..96fa26b20b67 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -55,7 +55,8 @@ xt_cluster_hash(const struct nf_conn *ct, WARN_ON(1); break; } - return (((u64)hash * info->total_nodes) >> 32); + + return reciprocal_scale(hash, info->total_nodes); } static inline bool diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 1e634615ab9d..d4bec261e74e 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -120,7 +120,7 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { - pr_warning("Forcing CT accounting to be enabled\n"); + pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index a3910fc2122b..05fbc2a0be46 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -104,7 +104,7 @@ struct xt_hashlimit_htable { spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ unsigned int count; /* number entries in table */ - struct timer_list timer; /* timer for gc */ + struct delayed_work gc_work; /* seq_file stuff */ struct proc_dir_entry *pde; @@ -135,7 +135,7 @@ hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) * give results between [0 and cfg.size-1] and same hash distribution, * but using a multiply, less expensive than a divide */ - return ((u64)hash * ht->cfg.size) >> 32; + return reciprocal_scale(hash, ht->cfg.size); } static struct dsthash_ent * @@ -213,7 +213,7 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) call_rcu_bh(&ent->rcu, dsthash_free_rcu); ht->count--; } -static void htable_gc(unsigned long htlong); +static void htable_gc(struct 
work_struct *work); static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) @@ -273,9 +273,9 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, } hinfo->net = net; - setup_timer(&hinfo->timer, htable_gc, (unsigned long)hinfo); - hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); - add_timer(&hinfo->timer); + INIT_DEFERRABLE_WORK(&hinfo->gc_work, htable_gc); + queue_delayed_work(system_power_efficient_wq, &hinfo->gc_work, + msecs_to_jiffies(hinfo->cfg.gc_interval)); hlist_add_head(&hinfo->node, &hashlimit_net->htables); @@ -300,29 +300,30 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, { unsigned int i; - /* lock hash table and iterate over it */ - spin_lock_bh(&ht->lock); for (i = 0; i < ht->cfg.size; i++) { struct dsthash_ent *dh; struct hlist_node *n; + + spin_lock_bh(&ht->lock); hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { if ((*select)(ht, dh)) dsthash_free(ht, dh); } + spin_unlock_bh(&ht->lock); + cond_resched(); } - spin_unlock_bh(&ht->lock); } -/* hash table garbage collector, run by timer */ -static void htable_gc(unsigned long htlong) +static void htable_gc(struct work_struct *work) { - struct xt_hashlimit_htable *ht = (struct xt_hashlimit_htable *)htlong; + struct xt_hashlimit_htable *ht; + + ht = container_of(work, struct xt_hashlimit_htable, gc_work.work); htable_selective_cleanup(ht, select_gc); - /* re-add the timer accordingly */ - ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); - add_timer(&ht->timer); + queue_delayed_work(system_power_efficient_wq, + &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval)); } static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) @@ -341,7 +342,7 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) static void htable_destroy(struct xt_hashlimit_htable *hinfo) { - del_timer_sync(&hinfo->timer); + 
cancel_delayed_work_sync(&hinfo->gc_work); htable_remove_proc_entry(hinfo); htable_selective_cleanup(hinfo, select_all); kfree(hinfo->name); @@ -942,7 +943,7 @@ static int __init hashlimit_mt_init(void) sizeof(struct dsthash_ent), 0, 0, NULL); if (!hashlimit_cachep) { - pr_warning("unable to create slab cache\n"); + pr_warn("unable to create slab cache\n"); goto err2; } return 0; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d7ca16b8b8df..f440f57a452f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -13,6 +13,7 @@ #include <linux/netfilter_bridge.h> #include <linux/netfilter/xt_physdev.h> #include <linux/netfilter/x_tables.h> +#include <net/netfilter/br_netfilter.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); @@ -87,6 +88,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; + br_netfilter_enable(); + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return -EINVAL; diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 80c2e2d603e0..5732cd64acc0 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -84,13 +84,12 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_warn("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { - pr_warning("Protocol error: set match dimension " - "is over the limit!\n"); + pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -134,13 +133,12 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par) index = ip_set_nfnl_get_byindex(par->net, 
info->match_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find set identified by id %u to match\n", - info->match_set.index); + pr_warn("Cannot find set identified by id %u to match\n", + info->match_set.index); return -ENOENT; } if (info->match_set.dim > IPSET_DIM_MAX) { - pr_warning("Protocol error: set match dimension " - "is over the limit!\n"); + pr_warn("Protocol error: set match dimension is over the limit!\n"); ip_set_nfnl_put(par->net, info->match_set.index); return -ERANGE; } @@ -230,8 +228,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -239,8 +237,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -248,8 +246,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { - pr_warning("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -303,8 +300,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->add_set.index != IPSET_INVALID_ID) { 
index = ip_set_nfnl_get_byindex(par->net, info->add_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find add_set index %u as target\n", - info->add_set.index); + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); return -ENOENT; } } @@ -312,8 +309,8 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) if (info->del_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, info->del_set.index); if (index == IPSET_INVALID_ID) { - pr_warning("Cannot find del_set index %u as target\n", - info->del_set.index); + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); return -ENOENT; @@ -321,8 +318,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par) } if (info->add_set.dim > IPSET_DIM_MAX || info->del_set.dim > IPSET_DIM_MAX) { - pr_warning("Protocol error: SET target dimension " - "is over the limit!\n"); + pr_warn("Protocol error: SET target dimension is over the limit!\n"); if (info->add_set.index != IPSET_INVALID_ID) ip_set_nfnl_put(par->net, info->add_set.index); if (info->del_set.index != IPSET_INVALID_ID) @@ -370,6 +366,140 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 target */ + +static unsigned int +set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + ADT_OPT(map_opt, par->family, info->map_set.dim, + info->map_set.flags, 0, UINT_MAX); + + int ret; + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + 
add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + if (info->map_set.index != IPSET_INVALID_ID) { + map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK | + IPSET_FLAG_MAP_SKBPRIO | + IPSET_FLAG_MAP_SKBQUEUE); + ret = match_set(info->map_set.index, skb, par, &map_opt, + info->map_set.flags & IPSET_INV_MATCH); + if (!ret) + return XT_CONTINUE; + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) + skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) + ^ (map_opt.ext.skbmark); + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) + skb->priority = map_opt.ext.skbprio; + if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && + skb->dev && + skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) + skb_set_queue_mapping(skb, map_opt.ext.skbqueue); + } + return XT_CONTINUE; +} + + +static int +set_target_v3_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + return -ENOENT; + } + } + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_warn("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & 
IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + !(par->hook_mask & (1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_warn("mapping of prio or/and queue is allowed only" + "from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + index = ip_set_nfnl_get_byindex(par->net, + info->map_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find map_set index %u as target\n", + info->map_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->del_set.index); + return -ENOENT; + } + } + + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX || + info->map_set.dim > IPSET_DIM_MAX) { + pr_warn("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v3_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +} + + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -497,6 +627,27 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v2_destroy, .me = THIS_MODULE }, + /* --map-set support */ + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV4, + .target = set_target_v3, + 
.targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV6, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, }; static int __init xt_set_init(void) diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index d3c48b14ab94..5699adb97652 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -29,7 +29,6 @@ string_mt(const struct sk_buff *skb, struct xt_action_param *par) struct ts_state state; bool invert; - memset(&state, 0, sizeof(struct ts_state)); invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 05ea4a4cc0ac..a845cd4cf21e 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -170,7 +170,6 @@ int netlbl_cfg_unlbl_map_add(const char *domain, #endif /* IPv6 */ default: goto cfg_unlbl_map_add_failure; - break; } entry->def.addrsel = addrmap; @@ -247,7 +246,6 @@ int netlbl_cfg_unlbl_static_add(struct net *net, * @addr: IP address in network byte order (struct in[6]_addr) * @mask: address mask in network byte order (struct in[6]_addr) * @family: address family - * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 1e779bb7fa43..adf8b7900da2 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -71,11 +71,7 @@ int __init netlbl_netlink_init(void) if (ret_val != 0) return ret_val; - ret_val = netlbl_unlabel_genl_init(); - if (ret_val != 0) - return ret_val; - - return 0; + return netlbl_unlabel_genl_init(); } /* diff --git 
a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e6fac7e3db52..c416725d28c4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -58,7 +58,9 @@ #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> +#include <linux/rhashtable.h> #include <asm/cacheflush.h> +#include <linux/hash.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -100,6 +102,19 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); +/* Protects netlink socket hash table mutations */ +DEFINE_MUTEX(nl_sk_hash_lock); +EXPORT_SYMBOL_GPL(nl_sk_hash_lock); + +static int lockdep_nl_sk_hash_is_held(void) +{ +#ifdef CONFIG_LOCKDEP + return (debug_locks) ? lockdep_is_held(&nl_sk_hash_lock) : 1; +#else + return 1; +#endif +} + static ATOMIC_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); @@ -110,11 +125,6 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } -static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u32 portid) -{ - return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; -} - int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -170,7 +180,6 @@ EXPORT_SYMBOL_GPL(netlink_remove_tap); static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; - bool pass = false; /* We take the more conservative approach and * whitelist socket protocols that may pass. 
@@ -184,11 +193,10 @@ static bool netlink_filter_tap(const struct sk_buff *skb) case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: - pass = true; - break; + return true; } - return pass; + return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, @@ -205,7 +213,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; - + skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); @@ -376,7 +384,7 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, if ((int)req->nm_block_size <= 0) return -EINVAL; - if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) + if (!PAGE_ALIGNED(req->nm_block_size)) return -EINVAL; if (req->nm_frame_size < NL_MMAP_HDRLEN) return -EINVAL; @@ -985,105 +993,48 @@ netlink_unlock_table(void) wake_up(&nl_table_wait); } -static bool netlink_compare(struct net *net, struct sock *sk) +struct netlink_compare_arg { - return net_eq(sock_net(sk), net); -} - -static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) -{ - struct netlink_table *table = &nl_table[protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; - struct sock *sk; - - read_lock(&nl_table_lock); - head = nl_portid_hashfn(hash, portid); - sk_for_each(sk, head) { - if (table->compare(net, sk) && - (nlk_sk(sk)->portid == portid)) { - sock_hold(sk); - goto found; - } - } - sk = NULL; -found: - read_unlock(&nl_table_lock); - return sk; -} + struct net *net; + u32 portid; +}; -static struct hlist_head *nl_portid_hash_zalloc(size_t size) +static bool netlink_compare(void *ptr, void *arg) { - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_ATOMIC); - else - return (struct hlist_head *) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); -} + struct netlink_compare_arg *x = arg; + struct sock *sk = ptr; -static 
void nl_portid_hash_free(struct hlist_head *table, size_t size) -{ - if (size <= PAGE_SIZE) - kfree(table); - else - free_pages((unsigned long)table, get_order(size)); + return nlk_sk(sk)->portid == x->portid && + net_eq(sock_net(sk), x->net); } -static int nl_portid_hash_rehash(struct nl_portid_hash *hash, int grow) +static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, + struct net *net) { - unsigned int omask, mask, shift; - size_t osize, size; - struct hlist_head *otable, *table; - int i; - - omask = mask = hash->mask; - osize = size = (mask + 1) * sizeof(*table); - shift = hash->shift; - - if (grow) { - if (++shift > hash->max_shift) - return 0; - mask = mask * 2 + 1; - size *= 2; - } - - table = nl_portid_hash_zalloc(size); - if (!table) - return 0; - - otable = hash->table; - hash->table = table; - hash->mask = mask; - hash->shift = shift; - get_random_bytes(&hash->rnd, sizeof(hash->rnd)); + struct netlink_compare_arg arg = { + .net = net, + .portid = portid, + }; + u32 hash; - for (i = 0; i <= omask; i++) { - struct sock *sk; - struct hlist_node *tmp; - - sk_for_each_safe(sk, tmp, &otable[i]) - __sk_add_node(sk, nl_portid_hashfn(hash, nlk_sk(sk)->portid)); - } + hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid)); - nl_portid_hash_free(otable, osize); - hash->rehash_time = jiffies + 10 * 60 * HZ; - return 1; + return rhashtable_lookup_compare(&table->hash, hash, + &netlink_compare, &arg); } -static inline int nl_portid_hash_dilute(struct nl_portid_hash *hash, int len) +static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { - int avg = hash->entries >> hash->shift; - - if (unlikely(avg > 1) && nl_portid_hash_rehash(hash, 1)) - return 1; + struct netlink_table *table = &nl_table[protocol]; + struct sock *sk; - if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { - nl_portid_hash_rehash(hash, 0); - return 1; - } + rcu_read_lock(); + sk = __netlink_lookup(table, portid, net); + if (sk) 
+ sock_hold(sk); + rcu_read_unlock(); - return 0; + return sk; } static const struct proto_ops netlink_ops; @@ -1115,22 +1066,10 @@ netlink_update_listeners(struct sock *sk) static int netlink_insert(struct sock *sk, struct net *net, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; int err = -EADDRINUSE; - struct sock *osk; - int len; - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - len = 0; - sk_for_each(osk, head) { - if (table->compare(net, osk) && - (nlk_sk(osk)->portid == portid)) - break; - len++; - } - if (osk) + mutex_lock(&nl_sk_hash_lock); + if (__netlink_lookup(table, portid, net)) goto err; err = -EBUSY; @@ -1138,26 +1077,31 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) goto err; err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX)) goto err; - if (len && nl_portid_hash_dilute(hash, len)) - head = nl_portid_hashfn(hash, portid); - hash->entries++; nlk_sk(sk)->portid = portid; - sk_add_node(sk, head); + sock_hold(sk); + rhashtable_insert(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL); err = 0; - err: - netlink_table_ungrab(); + mutex_unlock(&nl_sk_hash_lock); return err; } static void netlink_remove(struct sock *sk) { + struct netlink_table *table; + + mutex_lock(&nl_sk_hash_lock); + table = &nl_table[sk->sk_protocol]; + if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node, GFP_KERNEL)) { + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + mutex_unlock(&nl_sk_hash_lock); + netlink_table_grab(); - if (sk_del_node_init(sk)) - nl_table[sk->sk_protocol].hash.entries--; if (nlk_sk(sk)->subscriptions) __sk_del_bind_node(sk); netlink_table_ungrab(); @@ -1313,6 +1257,9 @@ static int netlink_release(struct socket *sock) } netlink_table_ungrab(); + /* Wait for readers to complete */ + synchronize_net(); + 
kfree(nlk->groups); nlk->groups = NULL; @@ -1328,30 +1275,22 @@ static int netlink_autobind(struct socket *sock) struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; - struct nl_portid_hash *hash = &table->hash; - struct hlist_head *head; - struct sock *osk; s32 portid = task_tgid_vnr(current); int err; static s32 rover = -4097; retry: cond_resched(); - netlink_table_grab(); - head = nl_portid_hashfn(hash, portid); - sk_for_each(osk, head) { - if (!table->compare(net, osk)) - continue; - if (nlk_sk(osk)->portid == portid) { - /* Bind collision, search negative portid values. */ - portid = rover--; - if (rover > -4097) - rover = -4097; - netlink_table_ungrab(); - goto retry; - } + rcu_read_lock(); + if (__netlink_lookup(table, portid, net)) { + /* Bind collision, search negative portid values. */ + portid = rover--; + if (rover > -4097) + rover = -4097; + rcu_read_unlock(); + goto retry; } - netlink_table_ungrab(); + rcu_read_unlock(); err = netlink_insert(sk, net, portid); if (err == -EADDRINUSE) @@ -1961,25 +1900,25 @@ struct netlink_broadcast_data { void *tx_data; }; -static int do_one_broadcast(struct sock *sk, - struct netlink_broadcast_data *p) +static void do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) - goto out; + return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) - goto out; + return; if (!net_eq(sock_net(sk), p->net)) - goto out; + return; if (p->failure) { netlink_overrun(sk); - goto out; + return; } sock_hold(sk); @@ -2017,9 +1956,6 @@ static int do_one_broadcast(struct sock *sk, p->skb2 = NULL; } sock_put(sk); - -out: - return 0; } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, @@ -2958,14 +2894,18 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) { struct nl_seq_iter 
*iter = seq->private; int i, j; + struct netlink_sock *nlk; struct sock *s; loff_t off = 0; for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; + struct rhashtable *ht = &nl_table[i].hash; + const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + + for (j = 0; j < tbl->size; j++) { + rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + s = (struct sock *)nlk; - for (j = 0; j <= hash->mask; j++) { - sk_for_each(s, &hash->table[j]) { if (sock_net(s) != seq_file_net(seq)) continue; if (off == pos) { @@ -2981,15 +2921,15 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) } static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(nl_table_lock) + __acquires(RCU) { - read_lock(&nl_table_lock); + rcu_read_lock(); return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct sock *s; + struct netlink_sock *nlk; struct nl_seq_iter *iter; struct net *net; int i, j; @@ -3001,28 +2941,26 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) net = seq_file_net(seq); iter = seq->private; - s = v; - do { - s = sk_next(s); - } while (s && !nl_table[s->sk_protocol].compare(net, s)); - if (s) - return s; + nlk = v; + + rht_for_each_entry_rcu(nlk, nlk->node.next, node) + if (net_eq(sock_net((struct sock *)nlk), net)) + return nlk; i = iter->link; j = iter->hash_idx + 1; do { - struct nl_portid_hash *hash = &nl_table[i].hash; - - for (; j <= hash->mask; j++) { - s = sk_head(&hash->table[j]); + struct rhashtable *ht = &nl_table[i].hash; + const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - while (s && !nl_table[s->sk_protocol].compare(net, s)) - s = sk_next(s); - if (s) { - iter->link = i; - iter->hash_idx = j; - return s; + for (; j < tbl->size; j++) { + rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + if (net_eq(sock_net((struct sock *)nlk), net)) 
{ + iter->link = i; + iter->hash_idx = j; + return nlk; + } } } @@ -3033,9 +2971,9 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void netlink_seq_stop(struct seq_file *seq, void *v) - __releases(nl_table_lock) + __releases(RCU) { - read_unlock(&nl_table_lock); + rcu_read_unlock(); } @@ -3173,9 +3111,17 @@ static struct pernet_operations __net_initdata netlink_net_ops = { static int __init netlink_proto_init(void) { int i; - unsigned long limit; - unsigned int order; int err = proto_register(&netlink_proto, 0); + struct rhashtable_params ht_params = { + .head_offset = offsetof(struct netlink_sock, node), + .key_offset = offsetof(struct netlink_sock, portid), + .key_len = sizeof(u32), /* portid */ + .hashfn = arch_fast_hash, + .max_shift = 16, /* 64K */ + .grow_decision = rht_grow_above_75, + .shrink_decision = rht_shrink_below_30, + .mutex_is_held = lockdep_nl_sk_hash_is_held, + }; if (err != 0) goto out; @@ -3186,32 +3132,13 @@ static int __init netlink_proto_init(void) if (!nl_table) goto panic; - if (totalram_pages >= (128 * 1024)) - limit = totalram_pages >> (21 - PAGE_SHIFT); - else - limit = totalram_pages >> (23 - PAGE_SHIFT); - - order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; - limit = (1UL << order) / sizeof(struct hlist_head); - order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1; - for (i = 0; i < MAX_LINKS; i++) { - struct nl_portid_hash *hash = &nl_table[i].hash; - - hash->table = nl_portid_hash_zalloc(1 * sizeof(*hash->table)); - if (!hash->table) { - while (i-- > 0) - nl_portid_hash_free(nl_table[i].hash.table, - 1 * sizeof(*hash->table)); + if (rhashtable_init(&nl_table[i].hash, &ht_params) < 0) { + while (--i > 0) + rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } - hash->max_shift = order; - hash->shift = 0; - hash->mask = 0; - hash->rehash_time = jiffies; - - nl_table[i].compare = netlink_compare; } INIT_LIST_HEAD(&netlink_tap_all); diff --git 
a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 0b59d441f5b6..b20a1731759b 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -1,6 +1,7 @@ #ifndef _AF_NETLINK_H #define _AF_NETLINK_H +#include <linux/rhashtable.h> #include <net/sock.h> #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -47,6 +48,8 @@ struct netlink_sock { struct netlink_ring tx_ring; atomic_t mapped; #endif /* CONFIG_NETLINK_MMAP */ + + struct rhash_head node; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) @@ -54,21 +57,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } -struct nl_portid_hash { - struct hlist_head *table; - unsigned long rehash_time; - - unsigned int mask; - unsigned int shift; - - unsigned int entries; - unsigned int max_shift; - - u32 rnd; -}; - struct netlink_table { - struct nl_portid_hash hash; + struct rhashtable hash; struct hlist_head mc_list; struct listeners __rcu *listeners; unsigned int flags; @@ -83,5 +73,6 @@ struct netlink_table { extern struct netlink_table *nl_table; extern rwlock_t nl_table_lock; +extern struct mutex nl_sk_hash_lock; #endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1af29624b92f..de8c74a3c061 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -4,6 +4,7 @@ #include <linux/netlink.h> #include <linux/sock_diag.h> #include <linux/netlink_diag.h> +#include <linux/rhashtable.h> #include "af_netlink.h" @@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct netlink_table *tbl = &nl_table[protocol]; - struct nl_portid_hash *hash = &tbl->hash; + struct rhashtable *ht = &tbl->hash; + const struct bucket_table *htbl = rht_dereference(ht->tbl, ht); struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; + struct netlink_sock *nlsk; struct sock *sk; int ret = 0, num = 0, i; req = nlmsg_data(cb->nlh); - for (i = 0; i <= 
hash->mask; i++) { - sk_for_each(sk, &hash->table[i]) { + for (i = 0; i < htbl->size; i++) { + rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) { + sk = (struct sock *)nlsk; + if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { @@ -165,6 +170,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) req = nlmsg_data(cb->nlh); + mutex_lock(&nl_sk_hash_lock); read_lock(&nl_table_lock); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { @@ -178,6 +184,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } else { if (req->sdiag_protocol >= MAX_LINKS) { read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return -ENOENT; } @@ -185,6 +192,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } read_unlock(&nl_table_lock); + mutex_unlock(&nl_sk_hash_lock); return skb->len; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ede50d197e10..71cf1bffea06 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1418,7 +1418,7 @@ static int __init nr_proto_init(void) struct net_device *dev; sprintf(name, "nr%d", i); - dev = alloc_netdev(0, name, nr_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); goto fail; diff --git a/net/nfc/digital.h b/net/nfc/digital.h index 71ad7eefddd4..3c39c72eb038 100644 --- a/net/nfc/digital.h +++ b/net/nfc/digital.h @@ -29,6 +29,7 @@ #define DIGITAL_CMD_TG_SEND 1 #define DIGITAL_CMD_TG_LISTEN 2 #define DIGITAL_CMD_TG_LISTEN_MDAA 3 +#define DIGITAL_CMD_TG_LISTEN_MD 4 #define DIGITAL_MAX_HEADER_LEN 7 #define DIGITAL_CRC_LEN 2 @@ -121,6 +122,8 @@ int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb); int digital_tg_listen_nfca(struct nfc_digital_dev *ddev, u8 rf_tech); int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech); +void digital_tg_recv_md_req(struct 
nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp); typedef u16 (*crc_func_t)(u16, const u8 *, size_t); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index a6ce3c627e4e..009bcf317101 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -201,6 +201,11 @@ static void digital_wq_cmd(struct work_struct *work) digital_send_cmd_complete, cmd); break; + case DIGITAL_CMD_TG_LISTEN_MD: + rc = ddev->ops->tg_listen_md(ddev, cmd->timeout, + digital_send_cmd_complete, cmd); + break; + default: pr_err("Unknown cmd type %d\n", cmd->type); return; @@ -293,12 +298,19 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) 500, digital_tg_recv_atr_req, NULL); } +static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MD, NULL, NULL, 500, + digital_tg_recv_md_req, NULL); +} + int digital_target_found(struct nfc_digital_dev *ddev, struct nfc_target *target, u8 protocol) { int rc; u8 framing; u8 rf_tech; + u8 poll_tech_count; int (*check_crc)(struct sk_buff *skb); void (*add_crc)(struct sk_buff *skb); @@ -375,12 +387,16 @@ int digital_target_found(struct nfc_digital_dev *ddev, return rc; target->supported_protocols = (1 << protocol); - rc = nfc_targets_found(ddev->nfc_dev, target, 1); - if (rc) - return rc; + poll_tech_count = ddev->poll_tech_count; ddev->poll_tech_count = 0; + rc = nfc_targets_found(ddev->nfc_dev, target, 1); + if (rc) { + ddev->poll_tech_count = poll_tech_count; + return rc; + } + return 0; } @@ -505,6 +521,9 @@ static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols, if (ddev->ops->tg_listen_mdaa) { digital_add_poll_tech(ddev, 0, digital_tg_listen_mdaa); + } else if (ddev->ops->tg_listen_md) { + digital_add_poll_tech(ddev, 0, + digital_tg_listen_md); } else { digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106A, digital_tg_listen_nfca); @@ -732,7 +751,7 @@ struct nfc_digital_dev *nfc_digital_allocate_device(struct 
nfc_digital_ops *ops, if (!ops->in_configure_hw || !ops->in_send_cmd || !ops->tg_listen || !ops->tg_configure_hw || !ops->tg_send_cmd || !ops->abort_cmd || - !ops->switch_rf) + !ops->switch_rf || (ops->tg_listen_md && !ops->tg_get_rf_tech)) return NULL; ddev = kzalloc(sizeof(struct nfc_digital_dev), GFP_KERNEL); diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 171cb9949ab5..b60aa35c074f 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -33,6 +33,8 @@ #define DIGITAL_ATR_REQ_MAX_SIZE 64 #define DIGITAL_LR_BITS_PAYLOAD_SIZE_254B 0x30 +#define DIGITAL_FSL_BITS_PAYLOAD_SIZE_254B \ + (DIGITAL_LR_BITS_PAYLOAD_SIZE_254B >> 4) #define DIGITAL_GB_BIT 0x02 #define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0) @@ -127,6 +129,98 @@ static int digital_skb_pull_dep_sod(struct nfc_digital_dev *ddev, return 0; } +static void digital_in_recv_psl_res(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + struct nfc_target *target = arg; + struct digital_psl_res *psl_res; + int rc; + + if (IS_ERR(resp)) { + rc = PTR_ERR(resp); + resp = NULL; + goto exit; + } + + rc = ddev->skb_check_crc(resp); + if (rc) { + PROTOCOL_ERR("14.4.1.6"); + goto exit; + } + + rc = digital_skb_pull_dep_sod(ddev, resp); + if (rc) { + PROTOCOL_ERR("14.4.1.2"); + goto exit; + } + + psl_res = (struct digital_psl_res *)resp->data; + + if ((resp->len != sizeof(*psl_res)) || + (psl_res->dir != DIGITAL_NFC_DEP_FRAME_DIR_IN) || + (psl_res->cmd != DIGITAL_CMD_PSL_RES)) { + rc = -EIO; + goto exit; + } + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, + NFC_DIGITAL_RF_TECH_424F); + if (rc) + goto exit; + + rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCF_NFC_DEP); + if (rc) + goto exit; + + if (!DIGITAL_DRV_CAPS_IN_CRC(ddev) && + (ddev->curr_rf_tech == NFC_DIGITAL_RF_TECH_106A)) { + ddev->skb_add_crc = digital_skb_add_crc_f; + ddev->skb_check_crc = digital_skb_check_crc_f; + } + + ddev->curr_rf_tech = 
NFC_DIGITAL_RF_TECH_424F; + + nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE, + NFC_RF_INITIATOR); + + ddev->curr_nfc_dep_pni = 0; + +exit: + dev_kfree_skb(resp); + + if (rc) + ddev->curr_protocol = 0; +} + +static int digital_in_send_psl_req(struct nfc_digital_dev *ddev, + struct nfc_target *target) +{ + struct sk_buff *skb; + struct digital_psl_req *psl_req; + + skb = digital_skb_alloc(ddev, sizeof(*psl_req)); + if (!skb) + return -ENOMEM; + + skb_put(skb, sizeof(*psl_req)); + + psl_req = (struct digital_psl_req *)skb->data; + + psl_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT; + psl_req->cmd = DIGITAL_CMD_PSL_REQ; + psl_req->did = 0; + psl_req->brs = (0x2 << 3) | 0x2; /* 424F both directions */ + psl_req->fsl = DIGITAL_FSL_BITS_PAYLOAD_SIZE_254B; + + digital_skb_push_dep_sod(ddev, skb); + + ddev->skb_add_crc(skb); + + return digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res, + target); +} + static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, struct sk_buff *resp) { @@ -166,6 +260,13 @@ static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg, if (rc) goto exit; + if ((ddev->protocols & NFC_PROTO_FELICA_MASK) && + (ddev->curr_rf_tech != NFC_DIGITAL_RF_TECH_424F)) { + rc = digital_in_send_psl_req(ddev, target); + if (!rc) + goto exit; + } + rc = nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE, NFC_RF_INITIATOR); @@ -457,12 +558,10 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, pr_err("Received a ACK/NACK PDU\n"); rc = -EINVAL; goto exit; - break; case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU: pr_err("Received a SUPERVISOR PDU\n"); rc = -EINVAL; goto exit; - break; } skb_pull(resp, size); @@ -673,6 +772,7 @@ void digital_tg_recv_atr_req(struct nfc_digital_dev *ddev, void *arg, int rc; struct digital_atr_req *atr_req; size_t gb_len, min_size; + u8 poll_tech_count; if (IS_ERR(resp)) { rc = PTR_ERR(resp); @@ -730,12 +830,16 @@ void digital_tg_recv_atr_req(struct 
nfc_digital_dev *ddev, void *arg, goto exit; gb_len = resp->len - sizeof(struct digital_atr_req); + + poll_tech_count = ddev->poll_tech_count; + ddev->poll_tech_count = 0; + rc = nfc_tm_activated(ddev->nfc_dev, NFC_PROTO_NFC_DEP_MASK, NFC_COMM_PASSIVE, atr_req->gb, gb_len); - if (rc) + if (rc) { + ddev->poll_tech_count = poll_tech_count; goto exit; - - ddev->poll_tech_count = 0; + } rc = 0; exit: diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index c2c1c0189b7c..fb58ed2dd41d 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -318,6 +318,8 @@ static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg, if (DIGITAL_SEL_RES_IS_T2T(sel_res)) { nfc_proto = NFC_PROTO_MIFARE; + } else if (DIGITAL_SEL_RES_IS_NFC_DEP(sel_res)) { + nfc_proto = NFC_PROTO_NFC_DEP; } else if (DIGITAL_SEL_RES_IS_T4T(sel_res)) { rc = digital_in_send_rats(ddev, target); if (rc) @@ -327,8 +329,6 @@ static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg, * done when receiving the ATS */ goto exit_free_skb; - } else if (DIGITAL_SEL_RES_IS_NFC_DEP(sel_res)) { - nfc_proto = NFC_PROTO_NFC_DEP; } else { rc = -EOPNOTSUPP; goto exit; @@ -944,6 +944,13 @@ static int digital_tg_send_sel_res(struct nfc_digital_dev *ddev) if (!DIGITAL_DRV_CAPS_TG_CRC(ddev)) digital_skb_add_crc_a(skb); + rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCA_ANTICOL_COMPLETE); + if (rc) { + kfree_skb(skb); + return rc; + } + rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_atr_req, NULL); if (rc) @@ -1002,6 +1009,13 @@ static int digital_tg_send_sdd_res(struct nfc_digital_dev *ddev) for (i = 0; i < 4; i++) sdd_res->bcc ^= sdd_res->nfcid1[i]; + rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A); + if (rc) { + kfree_skb(skb); + return rc; + } + rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_sel_req, NULL); if (rc) @@ -1054,6 
+1068,13 @@ static int digital_tg_send_sens_res(struct nfc_digital_dev *ddev) sens_res[0] = (DIGITAL_SENS_RES_NFC_DEP >> 8) & 0xFF; sens_res[1] = DIGITAL_SENS_RES_NFC_DEP & 0xFF; + rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCA_STANDARD); + if (rc) { + kfree_skb(skb); + return rc; + } + rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_sdd_req, NULL); if (rc) @@ -1197,33 +1218,48 @@ exit: dev_kfree_skb(resp); } -int digital_tg_listen_nfca(struct nfc_digital_dev *ddev, u8 rf_tech) +static int digital_tg_config_nfca(struct nfc_digital_dev *ddev) { int rc; - rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech); + rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, + NFC_DIGITAL_RF_TECH_106A); if (rc) return rc; - rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, - NFC_DIGITAL_FRAMING_NFCA_NFC_DEP); + return digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCA_NFC_DEP); +} + +int digital_tg_listen_nfca(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + int rc; + + rc = digital_tg_config_nfca(ddev); if (rc) return rc; return digital_tg_listen(ddev, 300, digital_tg_recv_sens_req, NULL); } -int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) +static int digital_tg_config_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) { int rc; - u8 *nfcid2; rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech); if (rc) return rc; - rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, - NFC_DIGITAL_FRAMING_NFCF_NFC_DEP); + return digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, + NFC_DIGITAL_FRAMING_NFCF_NFC_DEP); +} + +int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) +{ + int rc; + u8 *nfcid2; + + rc = digital_tg_config_nfcf(ddev, rf_tech); if (rc) return rc; @@ -1237,3 +1273,43 @@ int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech) return digital_tg_listen(ddev, 300, 
digital_tg_recv_sensf_req, nfcid2); } + +void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg, + struct sk_buff *resp) +{ + u8 rf_tech; + int rc; + + if (IS_ERR(resp)) { + resp = NULL; + goto exit_free_skb; + } + + rc = ddev->ops->tg_get_rf_tech(ddev, &rf_tech); + if (rc) + goto exit_free_skb; + + switch (rf_tech) { + case NFC_DIGITAL_RF_TECH_106A: + rc = digital_tg_config_nfca(ddev); + if (rc) + goto exit_free_skb; + digital_tg_recv_sens_req(ddev, arg, resp); + break; + case NFC_DIGITAL_RF_TECH_212F: + case NFC_DIGITAL_RF_TECH_424F: + rc = digital_tg_config_nfcf(ddev, rf_tech); + if (rc) + goto exit_free_skb; + digital_tg_recv_sensf_req(ddev, arg, resp); + break; + default: + goto exit_free_skb; + } + + return; + +exit_free_skb: + digital_poll_next_tech(ddev); + dev_kfree_skb(resp); +} diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 47403705197e..117708263ced 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -553,8 +553,11 @@ static void hci_stop_poll(struct nfc_dev *nfc_dev) { struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev); - nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, - NFC_HCI_EVT_END_OPERATION, NULL, 0); + if (hdev->ops->stop_poll) + hdev->ops->stop_poll(hdev); + else + nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE, + NFC_HCI_EVT_END_OPERATION, NULL, 0); } static int hci_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target, diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 2b400e1a8695..90b16cb40058 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -231,6 +231,14 @@ static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) cmd.num_disc_configs++; } + if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) && + (protocols & NFC_PROTO_ISO15693_MASK)) { + cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode = + NCI_NFC_V_PASSIVE_POLL_MODE; + cmd.disc_configs[cmd.num_disc_configs].frequency = 1; + cmd.num_disc_configs++; + } + nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_CMD, 
(1 + (cmd.num_disc_configs * sizeof(struct disc_config))), &cmd); @@ -751,10 +759,6 @@ int nci_register_device(struct nci_dev *ndev) struct device *dev = &ndev->nfc_dev->dev; char name[32]; - rc = nfc_register_device(ndev->nfc_dev); - if (rc) - goto exit; - ndev->flags = 0; INIT_WORK(&ndev->cmd_work, nci_cmd_work); @@ -762,7 +766,7 @@ int nci_register_device(struct nci_dev *ndev) ndev->cmd_wq = create_singlethread_workqueue(name); if (!ndev->cmd_wq) { rc = -ENOMEM; - goto unreg_exit; + goto exit; } INIT_WORK(&ndev->rx_work, nci_rx_work); @@ -792,6 +796,10 @@ int nci_register_device(struct nci_dev *ndev) mutex_init(&ndev->req_lock); + rc = nfc_register_device(ndev->nfc_dev); + if (rc) + goto destroy_rx_wq_exit; + goto exit; destroy_rx_wq_exit: @@ -800,9 +808,6 @@ destroy_rx_wq_exit: destroy_cmd_wq_exit: destroy_workqueue(ndev->cmd_wq); -unreg_exit: - nfc_unregister_device(ndev->nfc_dev); - exit: return rc; } diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 6c3aef852876..427ef2c7ab68 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -241,9 +241,12 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) /* strip the nci data header */ skb_pull(skb, NCI_DATA_HDR_SIZE); - if (ndev->target_active_prot == NFC_PROTO_MIFARE) { + if (ndev->target_active_prot == NFC_PROTO_MIFARE || + ndev->target_active_prot == NFC_PROTO_JEWEL || + ndev->target_active_prot == NFC_PROTO_FELICA || + ndev->target_active_prot == NFC_PROTO_ISO15693) { /* frame I/F => remove the status byte */ - pr_debug("NFC_PROTO_MIFARE => remove the status byte\n"); + pr_debug("frame I/F => remove the status byte\n"); skb_trim(skb, (skb->len - 1)); } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index f8f6af231381..205b35f666db 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -2,6 +2,7 @@ * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * + * Copyright (C) 2014 Marvell International Ltd. 
* Copyright (C) 2011 Texas Instruments, Inc. * * Written by Ilan Elias <ilane@ti.com> @@ -155,6 +156,24 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } +static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + __u8 *data) +{ + ++data; + nfcv_poll->dsfid = *data++; + memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE); + data += NFC_ISO15693_UID_MAXSIZE; + return data; +} + +__u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol) +{ + if (ndev->ops->get_rfprotocol) + return ndev->ops->get_rfprotocol(ndev, rf_protocol); + return 0; +} + static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, @@ -164,9 +183,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct rf_tech_specific_params_nfca_poll *nfca_poll; struct rf_tech_specific_params_nfcb_poll *nfcb_poll; struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; - if (rf_protocol == NCI_RF_PROTOCOL_T2T) + if (rf_protocol == NCI_RF_PROTOCOL_T1T) + protocol = NFC_PROTO_JEWEL_MASK; + else if (rf_protocol == NCI_RF_PROTOCOL_T2T) protocol = NFC_PROTO_MIFARE_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_ISO_DEP) if (rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) @@ -177,8 +199,10 @@ static int nci_add_new_protocol(struct nci_dev *ndev, protocol = NFC_PROTO_FELICA_MASK; else if (rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) protocol = NFC_PROTO_NFC_DEP_MASK; + else if (rf_protocol == NCI_RF_PROTOCOL_T5T) + protocol = NFC_PROTO_ISO15693_MASK; else - protocol = 0; + protocol = nci_get_prop_rf_protocol(ndev, rf_protocol); if (!(protocol & ndev->poll_prots)) { pr_err("the target found does not have the desired protocol\n"); @@ -211,6 +235,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, memcpy(target->sensf_res, nfcf_poll->sensf_res, target->sensf_res_len); } + } 
else if (rf_tech_and_mode == NCI_NFC_V_PASSIVE_POLL_MODE) { + nfcv_poll = (struct rf_tech_specific_params_nfcv_poll *)params; + + target->is_iso15693 = 1; + target->iso15693_dsfid = nfcv_poll->dsfid; + memcpy(target->iso15693_uid, nfcv_poll->uid, NFC_ISO15693_UID_MAXSIZE); } else { pr_err("unsupported rf_tech_and_mode 0x%x\n", rf_tech_and_mode); return -EPROTO; @@ -303,6 +333,11 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode); @@ -453,6 +488,11 @@ static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcf_poll), data); break; + case NCI_NFC_V_PASSIVE_POLL_MODE: + data = nci_extract_rf_params_nfcv_passive_poll(ndev, + &(ntf.rf_tech_specific_params.nfcv_poll), data); + break; + default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ecf491ad509..ba3bb8203b99 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_GENEVE + bool "Open vSwitch Geneve tunneling support" + depends on INET + depends on OPENVSWITCH + depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able to create a Geneve vport. + + Say N to exclude this support and reduce the binary size. 
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 3591cb5dae91..9a33a273c375 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) +openvswitch-y += vport-geneve.o +endif + ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e70d8b18e962..006886dbee36 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -35,13 +35,83 @@ #include <net/sctp/checksum.h> #include "datapath.h" +#include "flow.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb); + struct sw_flow_key *key, + const struct nlattr *attr, int len); + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. 
*/ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return the queued deferred action, or NULL if the fifo is full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} static int make_writable(struct sk_buff *skb, int write_len) { + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; @@ -70,6 +140,8 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return 0; @@ -405,16 +477,14 @@ static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) } static int output_userspace(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr) { struct dp_upcall_info upcall; const struct nlattr *a; int rem; - BUG_ON(!OVS_CB(skb)->pkt_key); - upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = 
OVS_CB(skb)->pkt_key; + upcall.key = key; upcall.userdata = NULL; upcall.portid = 0; @@ -434,8 +504,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, &upcall); } +static bool last_action(const struct nlattr *a, int rem) +{ + return a->nla_len == rem; +} + static int sample(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -455,8 +530,50 @@ static int sample(struct datapath *dp, struct sk_buff *skb, } } - return do_execute_actions(dp, skb, nla_data(acts_list), - nla_len(acts_list), true); + rem = nla_len(acts_list); + a = nla_data(acts_list); + + /* Actions list is empty, do nothing */ + if (unlikely(!rem)) + return 0; + + /* The only known usage of sample action is having a single user-space + * action. Treat this usage as a special case. + * The output_userspace() should clone the skb to be sent to the + * user space. This skb will be consumed by its caller. + */ + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && + last_action(a, rem))) + return output_userspace(dp, skb, key, a); + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + /* Skip the sample action when out of memory. */ + return 0; + + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. 
*/ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, @@ -473,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -505,9 +622,48 @@ static int execute_set_action(struct sk_buff *skb, return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + + if (!last_action(a, rem)) { + /* Recirc action is not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + /* Execute a list of actions against 'skb'. 
*/ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb) + struct sw_flow_key *key, + const struct nlattr *attr, int len) { /* Every output action needs a separate clone of 'skb', but the common * case is just a single output action, so that doing a clone and @@ -532,7 +688,11 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, a); + output_userspace(dp, skb, key, a); + break; + + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -545,12 +705,23 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb); break; + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (last_action(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, nla_data(a)); break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, a); + err = sample(dp, skb, key, a); if (unlikely(err)) /* skb already freed. */ return err; break; @@ -562,23 +733,72 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } } - if (prev_port != -1) { - if (keep_skb) - skb = skb_clone(skb, GFP_ATOMIC); - + if (prev_port != -1) do_output(dp, skb, prev_port); - } else if (!keep_skb) + else consume_skb(skb); return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. 
*/ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_actions *acts; + int err; + + acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); - OVS_CB(skb)->tun_key = NULL; - return do_execute_actions(dp, skb, acts->actions, - acts->actions_len, false); + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; + + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9db4bf6740d1..2e31d9e7f4dc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -47,8 +47,6 @@ #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> -#include <linux/genetlink.h> -#include <net/genetlink.h> #include <net/genetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -66,25 +64,26 @@ static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; static struct genl_family 
dp_datapath_genl_family; -static struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP +static const struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP, }; -static struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP +static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP, }; -struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP +static const struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP, }; /* Check if need to build a reply message. * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ -static bool ovs_must_notify(struct genl_info *info, - const struct genl_multicast_group *grp) +static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, + unsigned int group) { return info->nlhdr->nlmsg_flags & NLM_F_ECHO || - netlink_has_listeners(genl_info_net(info)->genl_sock, 0); + genl_has_listeners(family, genl_info_net(info)->genl_sock, + group); } static void ovs_notify(struct genl_family *family, @@ -158,7 +157,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -static const char *ovs_dp_name(const struct datapath *dp) +const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -239,45 +238,40 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. 
*/ -void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct dp_stats_percpu *stats; - struct sw_flow_key key; u64 *stats_counter; u32 n_mask_hit; - int error; stats = this_cpu_ptr(dp->stats_percpu); - /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key); - if (unlikely(error)) { - kfree_skb(skb); - return; - } - /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; + int error; upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.key = &key; + upcall.key = key; upcall.userdata = NULL; - upcall.portid = p->upcall_portid; - ovs_dp_upcall(dp, skb, &upcall); - consume_skb(skb); + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + error = ovs_dp_upcall(dp, skb, &upcall); + if (unlikely(error)) + kfree_skb(skb); + else + consume_skb(skb); stats_counter = &stats->n_missed; goto out; } OVS_CB(skb)->flow = flow; - OVS_CB(skb)->pkt_key = &key; - ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); - ovs_execute_actions(dp, skb); + ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); + ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: @@ -375,6 +369,8 @@ static size_t key_attr_size(void) + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -406,7 +402,7 @@ static int 
queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, { struct ovs_header *upcall; struct sk_buff *nskb = NULL; - struct sk_buff *user_skb; /* to be queued to userspace */ + struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; struct genl_info info = { .dst_sk = ovs_dp_get_net(dp)->genl_sock, @@ -464,7 +460,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); + err = ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); + BUG_ON(err); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -495,9 +492,11 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); + user_skb = NULL; out: if (err) skb_tx_error(skb); + kfree_skb(user_skb); kfree_skb(nskb); return err; } @@ -511,6 +510,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct ethhdr *eth; + struct vport *input_vport; int len; int err; @@ -545,13 +545,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key); + err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, + &flow->key); if (err) goto err_flow_free; - err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); - if (err) - goto err_flow_free; acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) @@ -559,12 +557,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); - 
rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->egress_tun_info = NULL; OVS_CB(packet)->flow = flow; - OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -574,8 +573,17 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!dp) goto err_unlock; + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + OVS_CB(packet)->input_vport = input_vport; + local_bh_disable(); - err = ovs_execute_actions(dp, packet); + err = ovs_execute_actions(dp, packet, &flow->key); local_bh_enable(); rcu_read_unlock(); @@ -759,7 +767,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act { struct sk_buff *skb; - if (!always && !ovs_must_notify(info, &ovs_dp_flow_multicast_group)) + if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; skb = genlmsg_new_unicast(ovs_flow_cmd_msg_size(acts), info, GFP_KERNEL); @@ -780,7 +788,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), info, always); - if (!skb || IS_ERR(skb)) + if (IS_ERR_OR_NULL(skb)) return skb; retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, @@ -928,11 +936,34 @@ error: return error; } +static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask) +{ + struct sw_flow_actions *acts; + struct sw_flow_key masked_key; + int error; + + acts = ovs_nla_alloc_flow_actions(nla_len(a)); + if (IS_ERR(acts)) + return acts; + + ovs_flow_mask_key(&masked_key, key, mask); + error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + 
kfree(acts); + return ERR_PTR(error); + } + + return acts; +} + static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key, masked_key; + struct sw_flow_key key; struct sw_flow *flow; struct sw_flow_mask mask; struct sk_buff *reply = NULL; @@ -954,17 +985,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) + acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask); + if (IS_ERR(acts)) { + error = PTR_ERR(acts); goto error; - - ovs_flow_mask_key(&masked_key, &key, &mask); - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], - &masked_key, 0, &acts); - if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree_acts; } } @@ -1189,7 +1213,7 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, }; -static struct genl_ops dp_flow_genl_ops[] = { +static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .policy = flow_policy, @@ -1373,7 +1397,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; - parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; ovs_dp_change(dp, a); @@ -1577,7 +1601,7 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, }; -static struct genl_ops dp_datapath_genl_ops[] = { +static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ .policy = datapath_policy, @@ -1632,8 +1656,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || - nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + vport->ops->get_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -1641,6 +1665,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, &vport_stats)) goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) + goto nla_put_failure; + err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; @@ -1762,7 +1789,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; - parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; vport = new_vport(&parms); err = PTR_ERR(vport); @@ -1812,8 +1839,14 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; } - if (a[OVS_VPORT_ATTR_UPCALL_PID]) - vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); @@ -1944,7 +1977,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, }; -static struct genl_ops dp_vport_genl_ops[] = { +static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .flags = GENL_ADMIN_PERM, /* Requires 
CAP_NET_ADMIN privilege. */ .policy = vport_policy, @@ -2053,10 +2086,18 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_flow_init(); + err = action_fifos_init(); if (err) goto error; + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + + err = ovs_flow_init(); + if (err) + goto error_unreg_rtnl_link; + err = ovs_vport_init(); if (err) goto error_flow_exit; @@ -2083,6 +2124,10 @@ error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); +error_unreg_rtnl_link: + ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); error: return err; } @@ -2095,6 +2140,8 @@ static void dp_cleanup(void) rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); + ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 7ede507500d7..974135439c5c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -95,14 +95,15 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. - * @pkt_key: The flow information extracted from the packet. Must be nonnull. - * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the - * packet is not being tunneled. + * @egress_tun_info: Tunnel information about this packet on egress path. + * NULL if the packet is not being tunneled. + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. 
*/ struct ovs_skb_cb { struct sw_flow *flow; - struct sw_flow_key *pkt_key; - struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *egress_tun_info; + struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -144,7 +145,7 @@ int lockdep_ovsl_is_held(void); #define lockdep_ovsl_is_held() 1 #endif -#define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) +#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) #define rcu_dereference_ovsl(p) \ @@ -183,17 +184,23 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); +const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *); + void ovs_dp_notify_wq(struct work_struct *work); +int action_fifos_init(void); +void action_fifos_exit(void); + #define OVS_NLERR(fmt, ...) \ do { \ if (net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d07ab538fc9d..62db02ba36bc 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -16,8 +16,6 @@ * 02110-1301, USA */ -#include "flow.h" -#include "datapath.h" #include <linux/uaccess.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> @@ -46,6 +44,10 @@ #include <net/ipv6.h> #include <net/ndisc.h> +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" + u64 ovs_flow_used_time(unsigned long flow_jiffies) { struct timespec cur_ts; @@ -89,7 +91,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, * allocated stats as we have already locked them. */ if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_dereference(flow->stats[node]))) { + && likely(!rcu_access_pointer(flow->stats[node]))) { /* Try to allocate node-specific stats. */ struct flow_stats *new_stats; @@ -420,10 +422,9 @@ invalid: } /** - * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the * Ethernet header - * @in_port: port number on which @skb was received. * @key: output flow key * * The caller must ensure that skb->len >= ETH_HLEN. @@ -442,18 +443,13 @@ invalid: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. 
*/ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { int error; struct ethhdr *eth; - memset(key, 0, sizeof(*key)); - - key->phy.priority = skb->priority; - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); - key->phy.in_port = in_port; - key->phy.skb_mark = skb->mark; + /* Flags are always used as part of stats */ + key->tp.flags = 0; skb_reset_mac_header(skb); @@ -469,6 +465,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) * update skb->csum here. */ + key->eth.tci = 0; if (vlan_tx_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) @@ -489,6 +486,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) error = check_iphdr(skb); if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -510,8 +509,10 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) return 0; } if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; /* Transport layer. 
*/ if (key->ip.proto == IPPROTO_TCP) { @@ -520,18 +521,25 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } + } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { @@ -541,33 +549,44 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) * them in 16-bit network byte order. */ key->tp.src = htons(icmp->type); key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } - } else if ((key->eth.type == htons(ETH_P_ARP) || - key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; arp = (struct arp_eth_header *)skb_network_header(skb); - if (arp->ar_hrd == htons(ARPHRD_ETHER) - && arp->ar_pro == htons(ETH_P_IP) - && arp->ar_hln == ETH_ALEN - && arp->ar_pln == 4) { + if (arphdr_ok(skb) && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { /* We only match on the lower 8 bits of the opcode. 
*/ if (ntohs(arp->ar_op) <= 0xff) key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -589,27 +608,87 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { error = parse_icmpv6(skb, key, nh_len); if (error) return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } } - return 0; } + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, + struct sk_buff *skb, struct sw_flow_key *key) +{ + /* Extract metadata from packet. 
*/ + if (tun_info) { + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(GENEVE_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; + memset(&key->tun_key, 0, sizeof(key->tun_key)); + } + + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + key->ovs_flow_hash = 0; + key->recirc_id = 0; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; + + return key_extract(skb, key); +} + +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key) +{ + int err; + + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(attr, key); + if (err) + return err; + + return key_extract(skb, key); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 5e5aaed3a85b..71813318c8c7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,29 +49,53 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; } __packed __aligned(4); /* Minimize padding. */ -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; + struct geneve_opt *options; + u8 options_len; +}; + +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. 
+ */ +#define GENEVE_OPTS(flow_key, opt_len) \ + ((struct geneve_opt *)((flow_key)->tun_opts + \ + FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ + opt_len)) + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be64 tun_id, __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = iph->saddr; + tun_info->tunnel.ipv4_dst = iph->daddr; + tun_info->tunnel.ipv4_tos = iph->tos; + tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.tun_flags = tun_flags; /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; } struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. 
*/ @@ -187,6 +211,12 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, + struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d757848da89c..368f23307911 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -42,6 +42,7 @@ #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/rculist.h> +#include <net/geneve.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match, } \ } while (0) -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range__(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) +#define SW_FLOW_KEY_MEMCPY(match, field, 
value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -251,6 +254,8 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), + [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, }; @@ -333,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -344,6 +350,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_OAM] = 0, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -352,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a)) { + if (ovs_tunnel_key_lens[type] != nla_len(a) && + ovs_tunnel_key_lens[type] != -1) { OVS_NLERR("IPv4 tunnel attribute type has unexpected " " length (type=%d, length=%d, expected=%d).\n", type, nla_len(a), ovs_tunnel_key_lens[type]); @@ -388,7 +397,63 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + tun_flags |= TUNNEL_OPTIONS_PRESENT; + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", + nla_len(a), + sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + 
OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", + match->key->tun_opts_len, + nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, + true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS( + (struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, + nla_data(a), nla_len(a), + is_mask); + break; default: + OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + type); return -EINVAL; } } @@ -415,45 +480,80 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, 
output->ipv4_src)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; - nla_nest_end(skb, nla); return 0; } +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + 
SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { SW_FLOW_KEY_PUT(match, phy.priority, nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); @@ -836,7 +936,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @key: Receives extracted in_port, priority, tun_key and skb_mark. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -846,32 +946,24 @@ int ovs_nla_get_match(struct sw_flow_match *match, * extracted from the packet itself. */ -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr) +int ovs_nla_get_flow_metadata(const struct nlattr *attr, + struct sw_flow_key *key) { - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct sw_flow_match match; u64 attrs = 0; int err; - struct sw_flow_match match; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); err = parse_flow_nlattrs(attr, a, &attrs); if (err) return -EINVAL; memset(&match, 0, sizeof(match)); - match.key = &flow->key; + match.key = key; - err = metadata_from_nlattrs(&match, &attrs, a, false); - if (err) - return err; + key->phy.in_port = DP_MAX_PORTS; - return 0; + return metadata_from_nlattrs(&match, &attrs, a, false); } int ovs_nla_put_flow(const struct sw_flow_key *swkey, @@ -881,13 +973,26 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, struct nlattr *nla, *encap; bool is_mask = (swkey != output); - if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) 
goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask) && - ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const struct geneve_opt *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = GENEVE_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } + if (swkey->phy.in_port == DP_MAX_PORTS) { if (is_mask && (output->phy.in_port == 0xffff)) if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) @@ -1127,13 +1232,14 @@ out: return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); } -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len) { struct nlattr *a; a = reserve_sfa_size(sfa, nla_attr_size(len)); if (IS_ERR(a)) - return PTR_ERR(a); + return a; a->nla_type = attrtype; a->nla_len = nla_attr_size(len); @@ -1142,6 +1248,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in memcpy(nla_data(a), data, len); memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len); + if (IS_ERR(a)) + return PTR_ERR(a); + return 0; } @@ -1247,6 +1365,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; int err, start; ovs_match_init(&match, &key, NULL); @@ -1254,12 +1374,56 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (err) return err; + if (key.tun_opts_len) { + struct geneve_opt *option = GENEVE_OPTS(&key, + 
key.tun_opts_len); + int opts_len = key.tun_opts_len; + bool crit_opt = false; + + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + }; + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info) + key.tun_opts_len); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + tun_info->options = (struct geneve_opt *)(tun_info + 1); + } else { + tun_info->options = NULL; + } + add_nested_action_end(*sfa, start); return err; @@ -1409,11 +1573,13 @@ int ovs_nla_copy_actions(const struct nlattr *attr, /* Expected argument lengths, (u32)-1 for variable length. 
*/ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1440,6 +1606,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -1452,6 +1630,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_RECIRC: + break; + case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, &skip_copy); if (err) @@ -1525,17 +1706,22 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) int err; switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + tun_info->options_len ? 
+ tun_info->options : NULL, + tun_info->options_len); if (err) return err; nla_nest_end(skb, start); break; + } default: if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) return -EMSGSIZE; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 440151045d39..206e45add888 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -42,8 +42,8 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_flow(const struct sw_flow_key *, const struct sw_flow_key *, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); + int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *, const struct nlattr *); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..910b3ef2c0d5 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/version.h> + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> +#include <linux/if_vlan.h> + +#include <net/geneve.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/xfrm.h> + +#include "datapath.h" +#include "vport.h" + +/** + * struct geneve_port - Keeps track of open UDP ports + * @sock: The socket created for this port number. + * @name: vport name. 
+ */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(__u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? 
TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. 
*/ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3]; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, + tun_info->options_len, (u8 *)tun_info->options, + false); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +const struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, +}; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f49148a07da2..108b82da2fd9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags), NULL, 0); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; } @@ -129,6 +131,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_key_ipv4_tunnel *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -136,16 +139,17 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); 
+ fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -153,7 +157,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) if (IS_ERR(rt)) return PTR_ERR(rt); - tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) @@ -185,15 +189,14 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, false); + tun_key->ipv4_dst, IPPROTO_GRE, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); err_free_rt: ip_rt_put(rt); error: diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 789af9280e77..84516126e5f3 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -26,6 +26,7 @@ #include <net/dst.h> #include <net/xfrm.h> +#include <net/rtnetlink.h> #include "datapath.h" #include "vport-internal_dev.h" @@ -121,6 +122,10 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_get_stats64 = internal_dev_get_stats, }; +static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { + .kind = "openvswitch", +}; + static void do_setup(struct net_device *netdev) { ether_setup(netdev); @@ -131,14 +136,18 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; 
netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + eth_hw_addr_random(netdev); } @@ -159,7 +168,8 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) netdev_vport = netdev_vport_priv(vport); netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, do_setup); + parms->name, NET_NAME_UNKNOWN, + do_setup); if (!netdev_vport->dev) { err = -ENOMEM; goto error_free_vport; @@ -248,3 +258,13 @@ struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) return internal_dev_priv(netdev)->vport; } + +int ovs_internal_dev_rtnl_link_register(void) +{ + return rtnl_link_register(&internal_dev_link_ops); +} + +void ovs_internal_dev_rtnl_link_unregister(void) +{ + rtnl_link_unregister(&internal_dev_link_ops); +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h index 9a7d30ecc6a2..1b179a190cff 100644 --- a/net/openvswitch/vport-internal_dev.h +++ b/net/openvswitch/vport-internal_dev.h @@ -24,5 +24,7 @@ int ovs_is_internal_dev(const struct net_device *); struct vport *ovs_internal_dev_get_vport(struct net_device *); +int ovs_internal_dev_rtnl_link_register(void); +void ovs_internal_dev_rtnl_link_unregister(void); #endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0edbd95c60e7..2735e01dca73 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2014 Nicira, Inc. * Copyright (c) 2013 Cisco Systems, Inc. 
* * This program is free software; you can redistribute it and/or @@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) /* Called with rcu_read_lock and BH disabled. */ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct vport *vport = vs->data; struct iphdr *iph; __be64 key; @@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); } static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) @@ -140,24 +140,24 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct ovs_key_ipv4_tunnel *tun_key; struct rtable *rt; struct flowi4 fl; __be16 src_port; - int port_min; - int port_max; __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_UDP; @@ -167,20 +167,18 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT 
? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; - inet_get_local_port_range(net, &port_min, &port_max); - src_port = vxlan_src_port(port_min, port_max, skb); + src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, - fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), + htonl(be64_to_cpu(tun_key->tun_id) << 8), false); if (err < 0) ip_rt_put(rt); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 42c0f4a0b78c..53001b020ca7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_VXLAN &ovs_vxlan_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_GENEVE + &ovs_geneve_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. 
*/ @@ -134,18 +137,20 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->dp = parms->dp; vport->port_no = parms->port_no; - vport->upcall_portid = parms->upcall_portid; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { + kfree(vport); + return ERR_PTR(-EINVAL); + } + vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); return ERR_PTR(-ENOMEM); } - spin_lock_init(&vport->stats_lock); - return vport; } @@ -161,6 +166,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, */ void ovs_vport_free(struct vport *vport) { + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); free_percpu(vport->percpu_stats); kfree(vport); } @@ -260,14 +269,10 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) * netdev-stats can be directly read over netlink-ioctl. */ - spin_lock_bh(&vport->stats_lock); - - stats->rx_errors = vport->err_stats.rx_errors; - stats->tx_errors = vport->err_stats.tx_errors; - stats->tx_dropped = vport->err_stats.tx_dropped; - stats->rx_dropped = vport->err_stats.rx_dropped; - - spin_unlock_bh(&vport->stats_lock); + stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); + stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); + stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); + stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); for_each_possible_cpu(i) { const struct pcpu_sw_netstats *percpu_stats; @@ -327,6 +332,99 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) } /** + * ovs_vport_set_upcall_portids - set upcall portids of @vport. + * + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. 
+ * + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. + */ +int ovs_vport_set_upcall_portids(struct vport *vport, struct nlattr *ids) +{ + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; + + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. + * + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. + * + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) +{ + struct vport_portids *ids; + + ids = rcu_dereference_ovsl(vport->upcall_portids); + + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); +} + +/** + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. + * + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. 
+ * + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. + */ +u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) +{ + struct vport_portids *ids; + u32 ids_index; + u32 hash; + + ids = rcu_dereference(p->upcall_portids); + + if (ids->n_ids == 1 && ids->ids[0] == 0) + return 0; + + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} + +/** * ovs_vport_receive - pass up received packet to the datapath for processing * * @vport: vport that received the packet @@ -337,9 +435,11 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) + struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; + struct sw_flow_key key; + int error; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -347,8 +447,15 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->tun_key = tun_key; - ovs_dp_process_received_packet(vport, skb); + OVS_CB(skb)->input_vport = vport; + OVS_CB(skb)->egress_tun_info = NULL; + /* Extract flow from 'skb' into 'key'. 
*/ + error = ovs_flow_key_extract(tun_info, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + ovs_dp_process_packet(skb, &key); } /** @@ -394,27 +501,24 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) static void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) { - spin_lock(&vport->stats_lock); - switch (err_type) { case VPORT_E_RX_DROPPED: - vport->err_stats.rx_dropped++; + atomic_long_inc(&vport->err_stats.rx_dropped); break; case VPORT_E_RX_ERROR: - vport->err_stats.rx_errors++; + atomic_long_inc(&vport->err_stats.rx_errors); break; case VPORT_E_TX_DROPPED: - vport->err_stats.tx_dropped++; + atomic_long_inc(&vport->err_stats.tx_dropped); break; case VPORT_E_TX_ERROR: - vport->err_stats.tx_errors++; + atomic_long_inc(&vport->err_stats.tx_errors); break; } - spin_unlock(&vport->stats_lock); } static void free_vport_rcu(struct rcu_head *rcu) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 8d721e62f388..8942125de3a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/netlink.h> #include <linux/openvswitch.h> +#include <linux/reciprocal_div.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> @@ -34,7 +35,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -/* The following definitions are for users of the vport subsytem: */ struct vport_net { struct vport __rcu *gre_vport; }; @@ -52,35 +52,52 @@ void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); +int ovs_vport_set_upcall_portids(struct vport *, struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); + int 
ovs_vport_send(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ struct vport_err_stats { - u64 rx_dropped; - u64 rx_errors; - u64 tx_dropped; - u64 tx_errors; + atomic_long_t rx_dropped; + atomic_long_t rx_errors; + atomic_long_t tx_dropped; + atomic_long_t tx_errors; +}; +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; }; /** * struct vport - one port within a datapath * @rcu: RCU callback head for deferred destruction. * @dp: Datapath to which this port belongs. - * @upcall_portid: The Netlink port to use for packets received on this port that - * miss the flow table. + * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @stats_lock: Protects @err_stats; * @err_stats: Points to error statistics used and maintained by vport */ struct vport { struct rcu_head rcu; struct datapath *dp; - u32 upcall_portid; + struct vport_portids __rcu *upcall_portids; u16 port_no; struct hlist_node hash_node; @@ -89,7 +106,6 @@ struct vport { struct pcpu_sw_netstats __percpu *percpu_stats; - spinlock_t stats_lock; struct vport_err_stats err_stats; }; @@ -111,7 +127,7 @@ struct vport_parms { /* For ovs_vport_alloc(). 
*/ struct datapath *dp; u16 port_no; - u32 upcall_portid; + struct nlattr *upcall_portids; }; /** @@ -191,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); + struct ovs_tunnel_info *); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ @@ -199,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; extern const struct vport_ops ovs_vxlan_vport_ops; +extern const struct vport_ops ovs_geneve_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b85c67ccb797..87d20f48ff06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -240,11 +240,9 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; - const struct net_device_ops *ops = dev->netdev_ops; netdev_features_t features; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; - u16 queue_map; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) @@ -255,17 +253,13 @@ static int packet_direct_xmit(struct sk_buff *skb) __skb_linearize(skb)) goto drop; - queue_map = skb_get_queue_mapping(skb); - txq = netdev_get_tx_queue(dev, queue_map); + txq = skb_get_tx_queue(dev, skb); local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) { - ret = ops->ndo_start_xmit(skb, dev); - if (ret == NETDEV_TX_OK) - txq_trans_update(txq); - } + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); local_bh_enable(); @@ -441,14 +435,10 @@ static __u32 tpacket_get_timestamp(struct sk_buff 
*skb, struct timespec *ts, { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - if (shhwtstamps) { - if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) - return TP_STATUS_TS_SYS_HARDWARE; - if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) - return TP_STATUS_TS_RAW_HARDWARE; - } + if (shhwtstamps && + (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + return TP_STATUS_TS_RAW_HARDWARE; if (ktime_to_timespec_cond(skb->tstamp, ts)) return TP_STATUS_TS_SOFTWARE; @@ -636,6 +626,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po, tx_ring); prb_open_block(p1, pbd); @@ -1946,6 +1937,18 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if ((int)snaplen < 0) snaplen = 0; } + } else if (unlikely(macoff + snaplen > + GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { + u32 nval; + + nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; + pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. 
macoff=%u\n", + snaplen, nval, macoff); + snaplen = nval; + if (unlikely((int)snaplen < 0)) { + snaplen = 0; + macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, @@ -3071,10 +3074,8 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); - break; case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); - break; case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; @@ -3789,6 +3790,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) goto out; + if (po->tp_version >= TPACKET_V3 && + (int)(req->tp_block_size - + BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) + goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) goto out; diff --git a/net/packet/internal.h b/net/packet/internal.h index eb9580a6b25f..cdddf6a30399 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -29,6 +29,7 @@ struct tpacket_kbdq_core { char *pkblk_start; char *pkblk_end; int kblk_size; + unsigned int max_frame_len; unsigned int knum_blocks; uint64_t knxt_seq_num; char *prev; diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c index 66dc65e7c6a1..e9a83a637185 100644 --- a/net/phonet/pep-gprs.c +++ b/net/phonet/pep-gprs.c @@ -267,7 +267,7 @@ int gprs_attach(struct sock *sk) return -EINVAL; /* need packet boundaries */ /* Create net device */ - dev = alloc_netdev(sizeof(*gp), ifname, gprs_setup); + dev = alloc_netdev(sizeof(*gp), ifname, NET_NAME_UNKNOWN, gprs_setup); if (!dev) return -ENOMEM; gp = netdev_priv(dev); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 56a6146ac94b..a58680016472 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -36,7 +36,7 @@ struct phonet_routes { struct mutex lock; - struct net_device *table[64]; + struct net_device 
__rcu *table[64]; }; struct phonet_net { @@ -275,7 +275,7 @@ static void phonet_route_autodel(struct net_device *dev) bitmap_zero(deleted, 64); mutex_lock(&pnn->routes.lock); for (i = 0; i < 64; i++) - if (dev == pnn->routes.table[i]) { + if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } @@ -388,7 +388,7 @@ int phonet_route_del(struct net_device *dev, u8 daddr) daddr = daddr >> 2; mutex_lock(&routes->lock); - if (dev == routes->table[daddr]) + if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 424ff622ab5f..10443377fb9d 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -83,7 +83,7 @@ static int rds_release(struct socket *sock) /* * the binding lookup hash uses rcu, we need to - * make sure we sychronize_rcu before we free our + * make sure we synchronize_rcu before we free our * entry */ rds_remove_bound(rs); diff --git a/net/rds/send.c b/net/rds/send.c index 23718160d71e..0a64541020b0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -593,8 +593,11 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; - sock_hold(rds_rs_to_sk(rs)); + if (rs) + sock_hold(rds_rs_to_sk(rs)); } + if (!rs) + goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { @@ -638,9 +641,6 @@ unlock_and_drop: * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. - * - * XXX It's not clear to me how this is safely serialized with socket - * destruction. Maybe it should bail if it sees SOCK_DEAD. 
*/ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked) @@ -711,6 +711,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&conn->c_lock, flags); + spin_lock_irqsave(&rm->m_rs_lock, flags); + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a65ee78db0c5..f9f564a6c960 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -106,11 +106,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rds_tcp_set_callbacks(sock, conn); ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); - sock = NULL; rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; + if (ret == 0) + sock = NULL; + else + rds_tcp_restore_callbacks(sock, conn->c_transport_data); out: if (sock) diff --git a/net/rds/threads.c b/net/rds/threads.c index 65eaefcab241..dc2402e871fd 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -78,8 +78,7 @@ void rds_connect_complete(struct rds_connection *conn) "current state is %d\n", __func__, atomic_read(&conn->c_state)); - atomic_set(&conn->c_state, RDS_CONN_ERROR); - queue_work(rds_wq, &conn->c_down_w); + rds_conn_drop(conn); return; } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index b3b16c070a7f..fa7cd792791c 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -329,7 +329,7 @@ static atomic_t rfkill_input_disabled = ATOMIC_INIT(0); /** * __rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected - * @state: the new state + * @blocked: the new state * * This function sets the state of all switches of given type, * unless a specific switch is claimed by userspace (in which case, @@ -353,7 +353,7 @@ static void __rfkill_switch_all(const enum 
rfkill_type type, bool blocked) /** * rfkill_switch_all - Toggle state of all switches of given type * @type: type of interfaces to be affected - * @state: the new state + * @blocked: the new state * * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state). * Please refer to __rfkill_switch_all() for details. diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 14c98e48f261..0f62326c0f5e 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -54,7 +54,7 @@ static int rfkill_gpio_set_power(void *data, bool blocked) if (blocked && !IS_ERR(rfkill->clk) && rfkill->clk_enabled) clk_disable(rfkill->clk); - rfkill->clk_enabled = blocked; + rfkill->clk_enabled = !blocked; return 0; } @@ -158,10 +158,12 @@ static const struct acpi_device_id rfkill_acpi_match[] = { { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, + { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, { "BCM4752", RFKILL_TYPE_GPS }, { "LNV4752", RFKILL_TYPE_GPS }, { }, }; +MODULE_DEVICE_TABLE(acpi, rfkill_acpi_match); #endif static struct platform_driver rfkill_gpio_driver = { diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8451c8cdc9de..a85c1a086ae4 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1538,7 +1538,7 @@ static int __init rose_proto_init(void) char name[IFNAMSIZ]; sprintf(name, "rose%d", i); - dev = alloc_netdev(0, name, rose_setup); + dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup); if (!dev) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); rc = -ENOMEM; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index bc5514211b0c..e873d7d9f857 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -160,7 +160,8 @@ void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigne break; case ROSE_DIAGNOSTIC: - printk(KERN_WARNING "ROSE: received diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], 
skb->data[5], skb->data[6]); + pr_warn("ROSE: received diagnostic #%d - %3ph\n", skb->data[3], + skb->data + 4); break; default: diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index db57458c824c..74c0fcd36838 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -37,7 +37,7 @@ void rxrpc_UDP_error_report(struct sock *sk) _enter("%p{%d}", sk, local->debug_id); - skb = skb_dequeue(&sk->sk_error_queue); + skb = sock_dequeue_err_skb(sk); if (!skb) { _leave("UDP socket errqueue empty"); return; @@ -111,18 +111,6 @@ void rxrpc_UDP_error_report(struct sock *sk) skb_queue_tail(&trans->error_queue, skb); rxrpc_queue_work(&trans->error_handler); - /* reset and regenerate socket error */ - spin_lock_bh(&sk->sk_error_queue.lock); - sk->sk_err = 0; - skb = skb_peek(&sk->sk_error_queue); - if (skb) { - sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno; - spin_unlock_bh(&sk->sk_error_queue.lock); - sk->sk_error_report(sk); - } else { - spin_unlock_bh(&sk->sk_error_queue.lock); - } - _leave(""); } diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 63b21e580de9..481f89f93789 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -45,7 +45,7 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp; struct rxrpc_sock *rx = call->socket; struct sock *sk; - int skb_len, ret; + int ret; _enter(",,%d,%d", force, terminal); @@ -101,13 +101,6 @@ int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, rx->interceptor(sk, call->user_call_ID, skb); spin_unlock_bh(&sk->sk_receive_queue.lock); } else { - - /* Cache the SKB length before we tack it onto the - * receive queue. 
Once it is added it no longer - * belongs to us and may be freed by other threads of - * control pulling packets from the queue */ - skb_len = skb->len; - _net("post skb %p", skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock_bh(&sk->sk_receive_queue.lock); diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 10c6cb694b43..db0f39f5ef96 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -348,7 +348,7 @@ static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, n_elem = ntohl(*xdr++); toklen -= 4; - if (n_elem < 0 || n_elem > max_n_elem) + if (n_elem > max_n_elem) return -EINVAL; *_n_elem = n_elem; if (n_elem > 0) { @@ -1141,7 +1141,7 @@ static long rxrpc_read(const struct key *key, if (copy_to_user(xdr, (s), _l) != 0) \ goto fault; \ if (_l & 3 && \ - copy_to_user((u8 *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ + copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ goto fault; \ xdr += (_l + 3) >> 2; \ } while(0) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 648778aef1a2..3d43e4979f27 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -252,7 +252,8 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est, + int err = gen_new_estimator(&p->tcfc_bstats, NULL, + &p->tcfc_rate_est, &p->tcfc_lock, est); if (err) { kfree(p); @@ -619,10 +620,12 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0) + gnet_stats_copy_queue(&d, NULL, + &p->tcfc_qstats, + p->tcfc_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/act_mirred.c 
b/net/sched/act_mirred.c index 4f912c0e225b..eb48306033d9 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -218,10 +218,12 @@ static int mirred_device_event(struct notifier_block *unused, if (event == NETDEV_UNREGISTER) list_for_each_entry(m, &mirred_list, tcfm_list) { + spin_lock_bh(&m->tcf_lock); if (m->tcfm_dev == dev) { dev_put(dev); m->tcfm_dev = NULL; } + spin_unlock_bh(&m->tcf_lock); } return NOTIFY_DONE; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0566e4606a4a..69791ca77a05 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -178,7 +178,7 @@ override: spin_lock_bh(&police->tcf_lock); if (est) { - err = gen_replace_estimator(&police->tcf_bstats, + err = gen_replace_estimator(&police->tcf_bstats, NULL, &police->tcf_rate_est, &police->tcf_lock, est); if (err) @@ -231,7 +231,7 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = ktime_to_ns(ktime_get()); + police->tcfp_t_c = ktime_get_ns(); police->tcf_index = parm->index ? 
parm->index : tcf_hash_new_index(hinfo); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -279,7 +279,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcfp_result; } - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - police->tcfp_t_c, police->tcfp_burst); if (police->peak_present) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 45527e6b52db..aad6a679fb13 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -117,7 +117,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; - spinlock_t *root_lock; struct tcmsg *t; u32 protocol; u32 prio; @@ -125,7 +124,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) u32 parent; struct net_device *dev; struct Qdisc *q; - struct tcf_proto **back, **chain; + struct tcf_proto __rcu **back; + struct tcf_proto __rcu **chain; struct tcf_proto *tp; const struct tcf_proto_ops *tp_ops; const struct Qdisc_class_ops *cops; @@ -197,7 +197,9 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp = *back) != NULL; back = &tp->next) { + for (back = chain; + (tp = rtnl_dereference(*back)) != NULL; + back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (!nprio || @@ -209,8 +211,6 @@ replay: } } - root_lock = qdisc_root_sleeping_lock(q); - if (tp == NULL) { /* Proto-tcf does not exist, create new one */ @@ -259,7 +259,8 @@ replay: } tp->ops = tp_ops; tp->protocol = protocol; - tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back)); + tp->prio = nprio ? 
: + TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back))); tp->q = q; tp->classify = tp_ops->classify; tp->classid = parent; @@ -280,9 +281,9 @@ replay: if (fh == 0) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { - spin_lock_bh(root_lock); - *back = tp->next; - spin_unlock_bh(root_lock); + struct tcf_proto *next = rtnl_dereference(tp->next); + + RCU_INIT_POINTER(*back, next); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER); tcf_destroy(tp); @@ -322,10 +323,8 @@ replay: n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE); if (err == 0) { if (tp_created) { - spin_lock_bh(root_lock); - tp->next = *back; - *back = tp; - spin_unlock_bh(root_lock); + RCU_INIT_POINTER(tp->next, rtnl_dereference(*back)); + rcu_assign_pointer(*back, tp); } tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); } else { @@ -420,7 +419,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct net_device *dev; struct Qdisc *q; - struct tcf_proto *tp, **chain; + struct tcf_proto *tp, __rcu **chain; struct tcmsg *tcm = nlmsg_data(cb->nlh); unsigned long cl = 0; const struct Qdisc_class_ops *cops; @@ -454,7 +453,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + for (tp = rtnl_dereference(*chain), t = 0; + tp; tp = rtnl_dereference(tp->next), t++) { if (t < s_t) continue; if (TC_H_MAJ(tcm->tcm_info) && @@ -496,7 +496,7 @@ out: return skb->len; } -void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); @@ -549,6 +549,7 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, tcf_tree_lock(tp); list_splice_init(&dst->actions, &tmp); list_splice(&src->actions, &dst->actions); + dst->type = src->type; tcf_tree_unlock(tp); tcf_action_destroy(&tmp, TCA_ACT_UNBIND); #endif @@ -561,13 
+562,14 @@ EXPORT_SYMBOL(tcf_exts_change); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + if (exts->action && !list_empty(&exts->actions)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ - struct nlattr *nest; if (exts->type != TCA_OLD_COMPAT) { nest = nla_nest_start(skb, exts->action); if (nest == NULL) @@ -585,10 +587,14 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) nla_nest_end(skb, nest); } } -#endif return 0; -nla_put_failure: __attribute__ ((unused)) + +nla_put_failure: + nla_nest_cancel(skb, nest); return -1; +#else + return 0; +#endif } EXPORT_SYMBOL(tcf_exts_dump); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 0ae1813e3e90..cd61280941e5 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -24,6 +24,7 @@ struct basic_head { u32 hgenerator; struct list_head flist; + struct rcu_head rcu; }; struct basic_filter { @@ -31,17 +32,19 @@ struct basic_filter { struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_result res; + struct tcf_proto *tp; struct list_head link; + struct rcu_head rcu; }; static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; - struct basic_head *head = tp->root; + struct basic_head *head = rcu_dereference_bh(tp->root); struct basic_filter *f; - list_for_each_entry(f, &head->flist, link) { + list_for_each_entry_rcu(f, &head->flist, link) { if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; *res = f->res; @@ -56,7 +59,7 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long basic_get(struct tcf_proto *tp, u32 handle) { unsigned long l = 0UL; - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; if (head == NULL) @@ -81,41 +84,43 @@ static int 
basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); - tp->root = head; + rcu_assign_pointer(tp->root, head); return 0; } -static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) +static void basic_delete_filter(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); - tcf_em_tree_destroy(tp, &f->ematches); + struct basic_filter *f = container_of(head, struct basic_filter, rcu); + + tcf_exts_destroy(&f->exts); + tcf_em_tree_destroy(&f->ematches); kfree(f); } static void basic_destroy(struct tcf_proto *tp) { - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; list_for_each_entry_safe(f, n, &head->flist, link) { - list_del(&f->link); - basic_delete_filter(tp, f); + list_del_rcu(&f->link); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, basic_delete_filter); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int basic_delete(struct tcf_proto *tp, unsigned long arg) { - struct basic_head *head = tp->root; + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *t, *f = (struct basic_filter *) arg; list_for_each_entry(t, &head->flist, link) if (t == f) { - tcf_tree_lock(tp); - list_del(&t->link); - tcf_tree_unlock(tp); - basic_delete_filter(tp, t); + list_del_rcu(&t->link); + tcf_unbind_filter(tp, &t->res); + call_rcu(&t->rcu, basic_delete_filter); return 0; } @@ -152,10 +157,11 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &f->exts, &e); tcf_em_tree_change(tp, &f->ematches, &t); + f->tp = tp; return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } @@ -164,9 +170,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { int err; - struct basic_head *head = tp->root; + struct basic_head *head = 
rtnl_dereference(tp->root); struct nlattr *tb[TCA_BASIC_MAX + 1]; - struct basic_filter *f = (struct basic_filter *) *arg; + struct basic_filter *fold = (struct basic_filter *) *arg; + struct basic_filter *fnew; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -176,22 +183,23 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - if (f != NULL) { - if (handle && f->handle != handle) + if (fold != NULL) { + if (handle && fold->handle != handle) return -EINVAL; - return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); } err = -ENOBUFS; - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (f == NULL) + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (fnew == NULL) goto errout; - tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); err = -EINVAL; - if (handle) - f->handle = handle; - else { + if (handle) { + fnew->handle = handle; + } else if (fold) { + fnew->handle = fold->handle; + } else { unsigned int i = 0x80000000; do { if (++head->hgenerator == 0x7FFFFFFF) @@ -203,29 +211,32 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, goto errout; } - f->handle = head->hgenerator; + fnew->handle = head->hgenerator; } - err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); if (err < 0) goto errout; - tcf_tree_lock(tp); - list_add(&f->link, &head->flist); - tcf_tree_unlock(tp); - *arg = (unsigned long) f; + *arg = (unsigned long)fnew; + + if (fold) { + list_replace_rcu(&fold->link, &fnew->link); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, basic_delete_filter); + } else { + list_add_rcu(&fnew->link, &head->flist); + } return 0; errout: - if (*arg == 0UL && f) - kfree(f); - + kfree(fnew); return err; } static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct basic_head *head = tp->root; + struct basic_head *head = 
rtnl_dereference(tp->root); struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 13f64df2c710..eed49d1d0878 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,16 +27,19 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; u32 hgen; + struct rcu_head rcu; }; struct cls_bpf_prog { - struct sk_filter *filter; + struct bpf_prog *filter; struct sock_filter *bpf_ops; struct tcf_exts exts; struct tcf_result res; struct list_head link; u32 handle; u16 bpf_len; + struct tcf_proto *tp; + struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { @@ -49,12 +52,12 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; int ret; - list_for_each_entry(prog, &head->plist, link) { - int filter_res = SK_RUN_FILTER(prog->filter, skb); + list_for_each_entry_rcu(prog, &head->plist, link) { + int filter_res = BPF_PROG_RUN(prog->filter, skb); if (filter_res == 0) continue; @@ -81,35 +84,39 @@ static int cls_bpf_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; - INIT_LIST_HEAD(&head->plist); - tp->root = head; + INIT_LIST_HEAD_RCU(&head->plist); + rcu_assign_pointer(tp->root, head); return 0; } static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { - tcf_unbind_filter(tp, &prog->res); - tcf_exts_destroy(tp, &prog->exts); + tcf_exts_destroy(&prog->exts); - sk_unattached_filter_destroy(prog->filter); + bpf_prog_destroy(prog->filter); kfree(prog->bpf_ops); kfree(prog); } +static void __cls_bpf_delete_prog(struct rcu_head *rcu) +{ + struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu); + + cls_bpf_delete_prog(prog->tp, prog); +} + static 
int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; list_for_each_entry(prog, &head->plist, link) { if (prog == todel) { - tcf_tree_lock(tp); - list_del(&prog->link); - tcf_tree_unlock(tp); - - cls_bpf_delete_prog(tp, prog); + list_del_rcu(&prog->link); + tcf_unbind_filter(tp, &prog->res); + call_rcu(&prog->rcu, __cls_bpf_delete_prog); return 0; } } @@ -119,27 +126,29 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) static void cls_bpf_destroy(struct tcf_proto *tp) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; list_for_each_entry_safe(prog, tmp, &head->plist, link) { - list_del(&prog->link); - cls_bpf_delete_prog(tp, prog); + list_del_rcu(&prog->link); + tcf_unbind_filter(tp, &prog->res); + call_rcu(&prog->rcu, __cls_bpf_delete_prog); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; unsigned long ret = 0UL; if (head == NULL) return 0UL; - list_for_each_entry(prog, &head->plist, link) { + list_for_each_entry_rcu(prog, &head->plist, link) { if (prog->handle == handle) { ret = (unsigned long) prog; break; @@ -158,10 +167,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { - struct sock_filter *bpf_ops, *bpf_old; + struct sock_filter *bpf_ops; struct tcf_exts exts; struct sock_fprog_kern tmp; - struct sk_filter *fp, *fp_old; + struct bpf_prog *fp; u16 bpf_size, bpf_len; u32 classid; int ret; @@ -193,34 +202,23 @@ static int cls_bpf_modify_existing(struct net *net, struct 
tcf_proto *tp, tmp.len = bpf_len; tmp.filter = bpf_ops; - ret = sk_unattached_filter_create(&fp, &tmp); + ret = bpf_prog_create(&fp, &tmp); if (ret) goto errout_free; - tcf_tree_lock(tp); - fp_old = prog->filter; - bpf_old = prog->bpf_ops; - prog->bpf_len = bpf_len; prog->bpf_ops = bpf_ops; prog->filter = fp; prog->res.classid = classid; - tcf_tree_unlock(tp); tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); - if (fp_old) - sk_unattached_filter_destroy(fp_old); - if (bpf_old) - kfree(bpf_old); - return 0; - errout_free: kfree(bpf_ops); errout: - tcf_exts_destroy(tp, &exts); + tcf_exts_destroy(&exts); return ret; } @@ -244,9 +242,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct cls_bpf_head *head = tp->root; - struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; + struct cls_bpf_prog *prog; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -256,18 +255,19 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) return ret; - if (prog != NULL) { - if (handle && prog->handle != handle) - return -EINVAL; - return cls_bpf_modify_existing(net, tp, prog, base, tb, - tca[TCA_RATE], ovr); - } - prog = kzalloc(sizeof(*prog), GFP_KERNEL); - if (prog == NULL) + if (!prog) return -ENOBUFS; tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + + if (oldprog) { + if (handle && oldprog->handle != handle) { + ret = -EINVAL; + goto errout; + } + } + if (handle == 0) prog->handle = cls_bpf_grab_new_handle(tp, head); else @@ -281,16 +281,18 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (ret < 0) goto errout; - tcf_tree_lock(tp); - list_add(&prog->link, &head->plist); - tcf_tree_unlock(tp); + if (oldprog) { + list_replace_rcu(&prog->link, 
&oldprog->link); + tcf_unbind_filter(tp, &oldprog->res); + call_rcu(&oldprog->rcu, __cls_bpf_delete_prog); + } else { + list_add_rcu(&prog->link, &head->plist); + } *arg = (unsigned long) prog; - return 0; errout: - if (*arg == 0UL && prog) - kfree(prog); + kfree(prog); return ret; } @@ -339,10 +341,10 @@ nla_put_failure: static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct cls_bpf_head *head = tp->root; + struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; - list_for_each_entry(prog, &head->plist, link) { + list_for_each_entry_rcu(prog, &head->plist, link) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long) prog, arg) < 0) { diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index cacf01bd04f0..d61a801222c1 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,17 +22,17 @@ struct cls_cgroup_head { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; + struct tcf_proto *tp; + struct rcu_head rcu; }; static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid; - rcu_read_lock(); classid = task_cls_state(current)->classid; - rcu_read_unlock(); /* * Due to the nature of the classifier it is required to ignore all @@ -80,13 +80,25 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; +static void cls_cgroup_destroy_rcu(struct rcu_head *root) +{ + struct cls_cgroup_head *head = container_of(root, + struct cls_cgroup_head, + rcu); + + tcf_exts_destroy(&head->exts); + tcf_em_tree_destroy(&head->ematches); + kfree(head); +} + static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { struct nlattr 
*tb[TCA_CGROUP_MAX + 1]; - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); + struct cls_cgroup_head *new; struct tcf_ematch_tree t; struct tcf_exts e; int err; @@ -94,53 +106,58 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!tca[TCA_OPTIONS]) return -EINVAL; - if (head == NULL) { - if (!handle) - return -EINVAL; - - head = kzalloc(sizeof(*head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; + if (!head && !handle) + return -EINVAL; - tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - head->handle = handle; + if (head && handle != head->handle) + return -ENOENT; - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); - } + new = kzalloc(sizeof(*head), GFP_KERNEL); + if (!new) + return -ENOBUFS; - if (handle != head->handle) - return -ENOENT; + tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + if (head) + new->handle = head->handle; + else + new->handle = handle; + new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], cgroup_policy); if (err < 0) - return err; + goto errout; tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) - return err; + goto errout; err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); - if (err < 0) - return err; + if (err < 0) { + tcf_exts_destroy(&e); + goto errout; + } - tcf_exts_change(tp, &head->exts, &e); - tcf_em_tree_change(tp, &head->ematches, &t); + tcf_exts_change(tp, &new->exts, &e); + tcf_em_tree_change(tp, &new->ematches, &t); + rcu_assign_pointer(tp->root, new); + if (head) + call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; +errout: + kfree(new); + return err; } static void cls_cgroup_destroy(struct tcf_proto *tp) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (head) { - tcf_exts_destroy(tp, &head->exts); - tcf_em_tree_destroy(tp, 
&head->ematches); - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + call_rcu(&head->rcu, cls_cgroup_destroy_rcu); } } @@ -151,7 +168,7 @@ static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; @@ -167,7 +184,7 @@ skip: static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct cls_cgroup_head *head = tp->root; + struct cls_cgroup_head *head = rtnl_dereference(tp->root); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 35be16f7c192..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -34,12 +34,14 @@ struct flow_head { struct list_head filters; + struct rcu_head rcu; }; struct flow_filter { struct list_head list; struct tcf_exts exts; struct tcf_ematch_tree ematches; + struct tcf_proto *tp; struct timer_list perturb_timer; u32 perturb_period; u32 handle; @@ -54,6 +56,7 @@ struct flow_filter { u32 divisor; u32 baseclass; u32 hashrnd; + struct rcu_head rcu; }; static inline u32 addr_fold(void *addr) @@ -276,14 +279,14 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct flow_head *head = tp->root; + struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; u32 keymask; u32 classid; unsigned int n, key; int r; - list_for_each_entry(f, &head->filters, list) { + list_for_each_entry_rcu(f, &head->filters, list) { u32 keys[FLOW_KEY_MAX + 1]; struct flow_keys flow_keys; @@ -346,13 +349,23 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; +static void 
flow_destroy_filter(struct rcu_head *head) +{ + struct flow_filter *f = container_of(head, struct flow_filter, rcu); + + del_timer_sync(&f->perturb_timer); + tcf_exts_destroy(&f->exts); + tcf_em_tree_destroy(&f->ematches); + kfree(f); +} + static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct flow_head *head = tp->root; - struct flow_filter *f; + struct flow_head *head = rtnl_dereference(tp->root); + struct flow_filter *fold, *fnew; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; struct tcf_exts e; @@ -401,20 +414,42 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err1; - f = (struct flow_filter *)*arg; - if (f != NULL) { + err = -ENOBUFS; + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + goto err2; + + fold = (struct flow_filter *)*arg; + if (fold) { err = -EINVAL; - if (f->handle != handle && handle) + if (fold->handle != handle && handle) goto err2; - mode = f->mode; + /* Copy fold into fnew */ + fnew->handle = fold->handle; + fnew->keymask = fold->keymask; + fnew->tp = fold->tp; + + fnew->handle = fold->handle; + fnew->nkeys = fold->nkeys; + fnew->keymask = fold->keymask; + fnew->mode = fold->mode; + fnew->mask = fold->mask; + fnew->xor = fold->xor; + fnew->rshift = fold->rshift; + fnew->addend = fold->addend; + fnew->divisor = fold->divisor; + fnew->baseclass = fold->baseclass; + fnew->hashrnd = fold->hashrnd; + + mode = fold->mode; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) goto err2; if (mode == FLOW_MODE_HASH) - perturb_period = f->perturb_period; + perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) goto err2; @@ -444,83 +479,72 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 
1); - err = -ENOBUFS; - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (f == NULL) - goto err2; - - f->handle = handle; - f->mask = ~0U; - tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); - - get_random_bytes(&f->hashrnd, 4); - f->perturb_timer.function = flow_perturbation; - f->perturb_timer.data = (unsigned long)f; - init_timer_deferrable(&f->perturb_timer); + fnew->handle = handle; + fnew->mask = ~0U; + fnew->tp = tp; + get_random_bytes(&fnew->hashrnd, 4); + tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); } - tcf_exts_change(tp, &f->exts, &e); - tcf_em_tree_change(tp, &f->ematches, &t); + fnew->perturb_timer.function = flow_perturbation; + fnew->perturb_timer.data = (unsigned long)fnew; + init_timer_deferrable(&fnew->perturb_timer); - tcf_tree_lock(tp); + tcf_exts_change(tp, &fnew->exts, &e); + tcf_em_tree_change(tp, &fnew->ematches, &t); + + netif_keep_dst(qdisc_dev(tp->q)); if (tb[TCA_FLOW_KEYS]) { - f->keymask = keymask; - f->nkeys = nkeys; + fnew->keymask = keymask; + fnew->nkeys = nkeys; } - f->mode = mode; + fnew->mode = mode; if (tb[TCA_FLOW_MASK]) - f->mask = nla_get_u32(tb[TCA_FLOW_MASK]); + fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]); if (tb[TCA_FLOW_XOR]) - f->xor = nla_get_u32(tb[TCA_FLOW_XOR]); + fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]); if (tb[TCA_FLOW_RSHIFT]) - f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); + fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]); if (tb[TCA_FLOW_ADDEND]) - f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); + fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]); if (tb[TCA_FLOW_DIVISOR]) - f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); + fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]); if (baseclass) - f->baseclass = baseclass; + fnew->baseclass = baseclass; - f->perturb_period = perturb_period; - del_timer(&f->perturb_timer); + fnew->perturb_period = perturb_period; if (perturb_period) - mod_timer(&f->perturb_timer, jiffies + perturb_period); + mod_timer(&fnew->perturb_timer, jiffies + perturb_period); if (*arg 
== 0) - list_add_tail(&f->list, &head->filters); + list_add_tail_rcu(&fnew->list, &head->filters); + else + list_replace_rcu(&fnew->list, &fold->list); - tcf_tree_unlock(tp); + *arg = (unsigned long)fnew; - *arg = (unsigned long)f; + if (fold) + call_rcu(&fold->rcu, flow_destroy_filter); return 0; err2: - tcf_em_tree_destroy(tp, &t); + tcf_em_tree_destroy(&t); + kfree(fnew); err1: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } -static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f) -{ - del_timer_sync(&f->perturb_timer); - tcf_exts_destroy(tp, &f->exts); - tcf_em_tree_destroy(tp, &f->ematches); - kfree(f); -} - static int flow_delete(struct tcf_proto *tp, unsigned long arg) { struct flow_filter *f = (struct flow_filter *)arg; - tcf_tree_lock(tp); - list_del(&f->list); - tcf_tree_unlock(tp); - flow_destroy_filter(tp, f); + list_del_rcu(&f->list); + call_rcu(&f->rcu, flow_destroy_filter); return 0; } @@ -532,28 +556,29 @@ static int flow_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->filters); - tp->root = head; + rcu_assign_pointer(tp->root, head); return 0; } static void flow_destroy(struct tcf_proto *tp) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) { - list_del(&f->list); - flow_destroy_filter(tp, f); + list_del_rcu(&f->list); + call_rcu(&f->rcu, flow_destroy_filter); } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static unsigned long flow_get(struct tcf_proto *tp, u32 handle) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry(f, &head->filters, list) + list_for_each_entry_rcu(f, &head->filters, list) if (f->handle == handle) return (unsigned long)f; return 0; @@ -626,10 +651,10 @@ nla_put_failure: static void flow_walk(struct tcf_proto 
*tp, struct tcf_walker *arg) { - struct flow_head *head = tp->root; + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; - list_for_each_entry(f, &head->filters, list) { + list_for_each_entry_rcu(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, (unsigned long)f, arg) < 0) { diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 861b03ccfed0..dbfdfd1f1a9f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,17 +33,20 @@ struct fw_head { u32 mask; - struct fw_filter *ht[HTSIZE]; + struct fw_filter __rcu *ht[HTSIZE]; + struct rcu_head rcu; }; struct fw_filter { - struct fw_filter *next; + struct fw_filter __rcu *next; u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; + struct tcf_proto *tp; + struct rcu_head rcu; }; static u32 fw_hash(u32 handle) @@ -56,14 +59,16 @@ static u32 fw_hash(u32 handle) static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = tp->root; + struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; int r; u32 id = skb->mark; if (head != NULL) { id &= head->mask; - for (f = head->ht[fw_hash(id)]; f; f = f->next) { + + for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f; + f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND @@ -92,13 +97,14 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; if (head == NULL) return 0; - for (f = head->ht[fw_hash(handle)]; f; f = f->next) { + f = rtnl_dereference(head->ht[fw_hash(handle)]); + for (; f; f = rtnl_dereference(f->next)) { if (f->id == handle) return (unsigned long)f; } @@ -114,16 +120,17 @@ static int fw_init(struct tcf_proto *tp) return 0; 
} -static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + struct fw_filter *f = container_of(head, struct fw_filter, rcu); + + tcf_exts_destroy(&f->exts); kfree(f); } static void fw_destroy(struct tcf_proto *tp) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; @@ -131,29 +138,35 @@ static void fw_destroy(struct tcf_proto *tp) return; for (h = 0; h < HTSIZE; h++) { - while ((f = head->ht[h]) != NULL) { - head->ht[h] = f->next; - fw_delete_filter(tp, f); + while ((f = rtnl_dereference(head->ht[h])) != NULL) { + RCU_INIT_POINTER(head->ht[h], + rtnl_dereference(f->next)); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fw_delete_filter); } } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *)arg; - struct fw_filter **fp; + struct fw_filter __rcu **fp; + struct fw_filter *pfp; if (head == NULL || f == NULL) goto out; - for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); - fw_delete_filter(tp, f); + fp = &head->ht[fw_hash(f->id)]; + + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) { + if (pfp == f) { + RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fw_delete_filter); return 0; } } @@ -171,7 +184,7 @@ static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct tcf_exts 
e; u32 mask; int err; @@ -210,7 +223,7 @@ fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } @@ -220,7 +233,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *) *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; @@ -233,10 +246,45 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - if (f != NULL) { + if (f) { + struct fw_filter *pfp, *fnew; + struct fw_filter __rcu **fp; + if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(net, tp, f, tb, tca, base, ovr); + + fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + fnew->id = f->id; + fnew->res = f->res; +#ifdef CONFIG_NET_CLS_IND + fnew->ifindex = f->ifindex; +#endif /* CONFIG_NET_CLS_IND */ + fnew->tp = f->tp; + + tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + + err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); + if (err < 0) { + kfree(fnew); + return err; + } + + fp = &head->ht[fw_hash(fnew->id)]; + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) + if (pfp == f) + break; + + RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); + rcu_assign_pointer(*fp, fnew); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fw_delete_filter); + + *arg = (unsigned long)fnew; + return err; } if (!handle) @@ -252,9 +300,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; head->mask = mask; - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); @@ -263,15 +309,14 @@ static int fw_change(struct net *net, struct 
sk_buff *in_skb, tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); f->id = handle; + f->tp = tp; err = fw_change_attrs(net, tp, f, tb, tca, base, ovr); if (err < 0) goto errout; - f->next = head->ht[fw_hash(handle)]; - tcf_tree_lock(tp); - head->ht[fw_hash(handle)] = f; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]); + rcu_assign_pointer(head->ht[fw_hash(handle)], f); *arg = (unsigned long)f; return 0; @@ -283,7 +328,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); int h; if (head == NULL) @@ -295,7 +340,8 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) for (h = 0; h < HTSIZE; h++) { struct fw_filter *f; - for (f = head->ht[h]; f; f = f->next) { + for (f = rtnl_dereference(head->ht[h]); f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; @@ -312,7 +358,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct fw_head *head = tp->root; + struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index dd9fc2523c76..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -29,25 +29,26 @@ * are mutually exclusive. * 3. 
"to TAG from ANY" has higher priority, than "to ANY from XXX" */ - struct route4_fastmap { - struct route4_filter *filter; - u32 id; - int iif; + struct route4_filter *filter; + u32 id; + int iif; }; struct route4_head { - struct route4_fastmap fastmap[16]; - struct route4_bucket *table[256 + 1]; + struct route4_fastmap fastmap[16]; + struct route4_bucket __rcu *table[256 + 1]; + struct rcu_head rcu; }; struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ - struct route4_filter *ht[16 + 16 + 1]; + struct route4_filter __rcu *ht[16 + 16 + 1]; + struct rcu_head rcu; }; struct route4_filter { - struct route4_filter *next; + struct route4_filter __rcu *next; u32 id; int iif; @@ -55,6 +56,8 @@ struct route4_filter { struct tcf_exts exts; u32 handle; struct route4_bucket *bkt; + struct tcf_proto *tp; + struct rcu_head rcu; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) @@ -64,14 +67,13 @@ static inline int route4_fastmap_hash(u32 id, int iif) return id & 0xF; } +static DEFINE_SPINLOCK(fastmap_lock); static void -route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) +route4_reset_fastmap(struct route4_head *head) { - spinlock_t *root_lock = qdisc_root_sleeping_lock(q); - - spin_lock_bh(root_lock); + spin_lock_bh(&fastmap_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); - spin_unlock_bh(root_lock); + spin_unlock_bh(&fastmap_lock); } static void @@ -80,9 +82,12 @@ route4_set_fastmap(struct route4_head *head, u32 id, int iif, { int h = route4_fastmap_hash(id, iif); + /* fastmap updates must look atomic to aling id, iff, filter */ + spin_lock_bh(&fastmap_lock); head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; + spin_unlock_bh(&fastmap_lock); } static inline int route4_hash_to(u32 id) @@ -123,7 +128,7 @@ static inline int route4_hash_wild(void) static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = 
tp->root; + struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; @@ -141,32 +146,43 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); + + spin_lock(&fastmap_lock); if (id == head->fastmap[h].id && iif == head->fastmap[h].iif && (f = head->fastmap[h].filter) != NULL) { - if (f == ROUTE4_FAILURE) + if (f == ROUTE4_FAILURE) { + spin_unlock(&fastmap_lock); goto failure; + } *res = f->res; + spin_unlock(&fastmap_lock); return 0; } + spin_unlock(&fastmap_lock); h = route4_hash_to(id); restart: - b = head->table[h]; + b = rcu_dereference_bh(head->table[h]); if (b) { - for (f = b->ht[route4_hash_from(id)]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]); + f; + f = rcu_dereference_bh(f->next)) if (f->id == id) ROUTE4_APPLY_RESULT(); - for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]); + f; + f = rcu_dereference_bh(f->next)) if (f->iif == iif) ROUTE4_APPLY_RESULT(); - for (f = b->ht[route4_hash_wild()]; f; f = f->next) + for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]); + f; + f = rcu_dereference_bh(f->next)) ROUTE4_APPLY_RESULT(); - } if (h < 256) { h = 256; @@ -213,7 +229,7 @@ static inline u32 from_hash(u32 id) static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; @@ -229,9 +245,11 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) if (h2 > 32) return 0; - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (b) { - for (f = b->ht[h2]; f; f = f->next) + for (f = rtnl_dereference(b->ht[h2]); + f; + f = rtnl_dereference(f->next)) if (f->handle == handle) return (unsigned long)f; } @@ -248,16 +266,17 @@ static int 
route4_init(struct tcf_proto *tp) } static void -route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) +route4_delete_filter(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); + struct route4_filter *f = container_of(head, struct route4_filter, rcu); + + tcf_exts_destroy(&f->exts); kfree(f); } static void route4_destroy(struct tcf_proto *tp) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) @@ -266,28 +285,36 @@ static void route4_destroy(struct tcf_proto *tp) for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (b) { for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; - while ((f = b->ht[h2]) != NULL) { - b->ht[h2] = f->next; - route4_delete_filter(tp, f); + while ((f = rtnl_dereference(b->ht[h2])) != NULL) { + struct route4_filter *next; + + next = rtnl_dereference(f->next); + RCU_INIT_POINTER(b->ht[h2], next); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, route4_delete_filter); } } - kfree(b); + RCU_INIT_POINTER(head->table[h1], NULL); + kfree_rcu(b, rcu); } } - kfree(head); + RCU_INIT_POINTER(tp->root, NULL); + kfree_rcu(head, rcu); } static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = tp->root; - struct route4_filter **fp, *f = (struct route4_filter *)arg; - unsigned int h = 0; + struct route4_head *head = rtnl_dereference(tp->root); + struct route4_filter *f = (struct route4_filter *)arg; + struct route4_filter __rcu **fp; + struct route4_filter *nf; struct route4_bucket *b; + unsigned int h = 0; int i; if (!head || !f) @@ -296,27 +323,36 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) h = f->handle; b = f->bkt; - for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); - - 
route4_reset_fastmap(tp->q, head, f->id); - route4_delete_filter(tp, f); - - /* Strip tree */ - - for (i = 0; i <= 32; i++) - if (b->ht[i]) + fp = &b->ht[from_hash(h >> 16)]; + for (nf = rtnl_dereference(*fp); nf; + fp = &nf->next, nf = rtnl_dereference(*fp)) { + if (nf == f) { + /* unlink it */ + RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); + + /* Remove any fastmap lookups that might ref filter + * notice we unlink'd the filter so we can't get it + * back in the fastmap. + */ + route4_reset_fastmap(head); + + /* Delete it */ + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, route4_delete_filter); + + /* Strip RTNL protected tree */ + for (i = 0; i <= 32; i++) { + struct route4_filter *rt; + + rt = rtnl_dereference(b->ht[i]); + if (rt) return 0; + } /* OK, session has no flows */ - tcf_tree_lock(tp); - head->table[to_hash(h)] = NULL; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(head->table[to_hash(h)], NULL); + kfree_rcu(b, rcu); - kfree(b); return 0; } } @@ -380,26 +416,25 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, } h1 = to_hash(nhandle); - b = head->table[h1]; + b = rtnl_dereference(head->table[h1]); if (!b) { err = -ENOBUFS; b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) goto errout; - tcf_tree_lock(tp); - head->table[h1] = b; - tcf_tree_unlock(tp); + rcu_assign_pointer(head->table[h1], b); } else { unsigned int h2 = from_hash(nhandle >> 16); err = -EEXIST; - for (fp = b->ht[h2]; fp; fp = fp->next) + for (fp = rtnl_dereference(b->ht[h2]); + fp; + fp = rtnl_dereference(fp->next)) if (fp->handle == f->handle) goto errout; } - tcf_tree_lock(tp); if (tb[TCA_ROUTE4_TO]) f->id = to; @@ -410,7 +445,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, f->handle = nhandle; f->bkt = b; - tcf_tree_unlock(tp); + f->tp = tp; if (tb[TCA_ROUTE4_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); @@ -421,7 +456,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, return 
0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } @@ -431,14 +466,15 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct route4_head *head = tp->root; - struct route4_filter *f, *f1, **fp; + struct route4_head *head = rtnl_dereference(tp->root); + struct route4_filter __rcu **fp; + struct route4_filter *fold, *f1, *pfp, *f = NULL; struct route4_bucket *b; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; - u32 old_handle = 0; int err; + bool new = true; if (opt == NULL) return handle ? -EINVAL : 0; @@ -447,70 +483,73 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - f = (struct route4_filter *)*arg; - if (f) { - if (f->handle != handle && handle) + fold = (struct route4_filter *)*arg; + if (fold && handle && fold->handle != handle) return -EINVAL; - if (f->bkt) - old_handle = f->handle; - - err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 0, ovr); - if (err < 0) - return err; - - goto reinsert; - } - err = -ENOBUFS; if (head == NULL) { head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) goto errout; - - tcf_tree_lock(tp); - tp->root = head; - tcf_tree_unlock(tp); + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); - if (f == NULL) + if (!f) goto errout; tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + if (fold) { + f->id = fold->id; + f->iif = fold->iif; + f->res = fold->res; + f->handle = fold->handle; + + f->tp = fold->tp; + f->bkt = fold->bkt; + new = false; + } + err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], 1, ovr); + tca[TCA_RATE], new, ovr); if (err < 0) goto errout; -reinsert: h = from_hash(f->handle >> 16); - for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) + fp = &f->bkt->ht[h]; + for (pfp = rtnl_dereference(*fp); 
+ (f1 = rtnl_dereference(*fp)) != NULL; + fp = &f1->next) if (f->handle < f1->handle) break; - f->next = f1; - tcf_tree_lock(tp); - *fp = f; + netif_keep_dst(qdisc_dev(tp->q)); + rcu_assign_pointer(f->next, f1); + rcu_assign_pointer(*fp, f); - if (old_handle && f->handle != old_handle) { - th = to_hash(old_handle); - h = from_hash(old_handle >> 16); - b = head->table[th]; + if (fold && fold->handle && f->handle != fold->handle) { + th = to_hash(fold->handle); + h = from_hash(fold->handle >> 16); + b = rtnl_dereference(head->table[th]); if (b) { - for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { - if (*fp == f) { + fp = &b->ht[h]; + for (pfp = rtnl_dereference(*fp); pfp; + fp = &pfp->next, pfp = rtnl_dereference(*fp)) { + if (pfp == f) { *fp = f->next; break; } } } } - tcf_tree_unlock(tp); - route4_reset_fastmap(tp->q, head, f->id); + route4_reset_fastmap(head); *arg = (unsigned long)f; + if (fold) { + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, route4_delete_filter); + } return 0; errout: @@ -520,7 +559,7 @@ errout: static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct route4_head *head = tp->root; + struct route4_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (head == NULL) @@ -530,13 +569,15 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) return; for (h = 0; h <= 256; h++) { - struct route4_bucket *b = head->table[h]; + struct route4_bucket *b = rtnl_dereference(head->table[h]); if (b) { for (h1 = 0; h1 <= 32; h1++) { struct route4_filter *f; - for (f = b->ht[h1]; f; f = f->next) { + for (f = rtnl_dereference(b->ht[h1]); + f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 1020e233a5d6..6bb55f277a5a 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -70,31 +70,34 @@ struct rsvp_head { u32 tmap[256/32]; u32 hgenerator; u8 tgenerator; - struct rsvp_session *ht[256]; + 
struct rsvp_session __rcu *ht[256]; + struct rcu_head rcu; }; struct rsvp_session { - struct rsvp_session *next; - __be32 dst[RSVP_DST_LEN]; - struct tc_rsvp_gpi dpi; - u8 protocol; - u8 tunnelid; + struct rsvp_session __rcu *next; + __be32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter *ht[16 + 1]; + struct rsvp_filter __rcu *ht[16 + 1]; + struct rcu_head rcu; }; struct rsvp_filter { - struct rsvp_filter *next; - __be32 src[RSVP_DST_LEN]; - struct tc_rsvp_gpi spi; - u8 tunnelhdr; + struct rsvp_filter __rcu *next; + __be32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; - struct tcf_result res; - struct tcf_exts exts; + struct tcf_result res; + struct tcf_exts exts; - u32 handle; - struct rsvp_session *sess; + u32 handle; + struct rsvp_session *sess; + struct rcu_head rcu; }; static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) @@ -128,7 +131,7 @@ static inline unsigned int hash_src(__be32 *src) static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_head *head = rcu_dereference_bh(tp->root); struct rsvp_session *s; struct rsvp_filter *f; unsigned int h1, h2; @@ -169,7 +172,8 @@ restart: h1 = hash_dst(dst, protocol, tunnelid); h2 = hash_src(src); - for (s = sht[h1]; s; s = s->next) { + for (s = rcu_dereference_bh(head->ht[h1]); s; + s = rcu_dereference_bh(s->next)) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && protocol == s->protocol && !(s->dpi.mask & @@ -181,7 +185,8 @@ restart: #endif tunnelid == s->tunnelid) { - for (f = s->ht[h2]; f; f = f->next) { + for (f = rcu_dereference_bh(s->ht[h2]); f; + f = rcu_dereference_bh(f->next)) { if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) #if RSVP_DST_LEN == 4 @@ -205,7 +210,8 
@@ matched: } /* And wildcard bucket... */ - for (f = s->ht[16]; f; f = f->next) { + for (f = rcu_dereference_bh(s->ht[16]); f; + f = rcu_dereference_bh(f->next)) { *res = f->res; RSVP_APPLY_RESULT(); goto matched; @@ -216,9 +222,36 @@ matched: return -1; } +static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) +{ + struct rsvp_head *head = rtnl_dereference(tp->root); + struct rsvp_session *s; + struct rsvp_filter __rcu **ins; + struct rsvp_filter *pins; + unsigned int h1 = h & 0xFF; + unsigned int h2 = (h >> 8) & 0xFF; + + for (s = rtnl_dereference(head->ht[h1]); s; + s = rtnl_dereference(s->next)) { + for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ; + ins = &pins->next, pins = rtnl_dereference(*ins)) { + if (pins->handle == h) { + RCU_INIT_POINTER(n->next, pins->next); + rcu_assign_pointer(*ins, n); + return; + } + } + } + + /* Something went wrong if we are trying to replace a non-existant + * node. Mind as well halt instead of silently failing. + */ + BUG_ON(1); +} + static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { - struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; + struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_session *s; struct rsvp_filter *f; unsigned int h1 = handle & 0xFF; @@ -227,8 +260,10 @@ static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) if (h2 > 16) return 0; - for (s = sht[h1]; s; s = s->next) { - for (f = s->ht[h2]; f; f = f->next) { + for (s = rtnl_dereference(head->ht[h1]); s; + s = rtnl_dereference(s->next)) { + for (f = rtnl_dereference(s->ht[h2]); f; + f = rtnl_dereference(f->next)) { if (f->handle == handle) return (unsigned long)f; } @@ -246,7 +281,7 @@ static int rsvp_init(struct tcf_proto *tp) data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); if (data) { - tp->root = data; + rcu_assign_pointer(tp->root, data); return 0; } return -ENOBUFS; @@ -256,54 +291,55 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { 
tcf_unbind_filter(tp, &f->res); - tcf_exts_destroy(tp, &f->exts); - kfree(f); + tcf_exts_destroy(&f->exts); + kfree_rcu(f, rcu); } static void rsvp_destroy(struct tcf_proto *tp) { - struct rsvp_head *data = xchg(&tp->root, NULL); - struct rsvp_session **sht; + struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; if (data == NULL) return; - sht = data->ht; + RCU_INIT_POINTER(tp->root, NULL); for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; - while ((s = sht[h1]) != NULL) { - sht[h1] = s->next; + while ((s = rtnl_dereference(data->ht[h1])) != NULL) { + RCU_INIT_POINTER(data->ht[h1], s->next); for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; - while ((f = s->ht[h2]) != NULL) { - s->ht[h2] = f->next; + while ((f = rtnl_dereference(s->ht[h2])) != NULL) { + rcu_assign_pointer(s->ht[h2], f->next); rsvp_delete_filter(tp, f); } } - kfree(s); + kfree_rcu(s, rcu); } } - kfree(data); + kfree_rcu(data, rcu); } static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) { - struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + struct rsvp_head *head = rtnl_dereference(tp->root); + struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg; + struct rsvp_filter __rcu **fp; unsigned int h = f->handle; - struct rsvp_session **sp; - struct rsvp_session *s = f->sess; + struct rsvp_session __rcu **sp; + struct rsvp_session *nsp, *s = f->sess; int i; - for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { - if (*fp == f) { - tcf_tree_lock(tp); - *fp = f->next; - tcf_tree_unlock(tp); + fp = &s->ht[(h >> 8) & 0xFF]; + for (nfp = rtnl_dereference(*fp); nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) { + if (nfp == f) { + RCU_INIT_POINTER(*fp, f->next); rsvp_delete_filter(tp, f); /* Strip tree */ @@ -313,14 +349,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) return 0; /* OK, session has no flows */ - for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; - *sp; sp = &(*sp)->next) { - if (*sp == s) { - tcf_tree_lock(tp); - 
*sp = s->next; - tcf_tree_unlock(tp); - - kfree(s); + sp = &head->ht[h & 0xFF]; + for (nsp = rtnl_dereference(*sp); nsp; + sp = &nsp->next, nsp = rtnl_dereference(*sp)) { + if (nsp == s) { + RCU_INIT_POINTER(*sp, s->next); + kfree_rcu(s, rcu); return 0; } } @@ -333,7 +367,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) { - struct rsvp_head *data = tp->root; + struct rsvp_head *data = rtnl_dereference(tp->root); int i = 0xFFFF; while (i-- > 0) { @@ -361,7 +395,7 @@ static int tunnel_bts(struct rsvp_head *data) static void tunnel_recycle(struct rsvp_head *data) { - struct rsvp_session **sht = data->ht; + struct rsvp_session __rcu **sht = data->ht; u32 tmap[256/32]; int h1, h2; @@ -369,11 +403,13 @@ static void tunnel_recycle(struct rsvp_head *data) for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; - for (s = sht[h1]; s; s = s->next) { + for (s = rtnl_dereference(sht[h1]); s; + s = rtnl_dereference(s->next)) { for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; - for (f = s->ht[h2]; f; f = f->next) { + for (f = rtnl_dereference(s->ht[h2]); f; + f = rtnl_dereference(f->next)) { if (f->tunnelhdr == 0) continue; data->tgenerator = f->res.classid; @@ -417,9 +453,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct nlattr **tca, unsigned long *arg, bool ovr) { - struct rsvp_head *data = tp->root; - struct rsvp_filter *f, **fp; - struct rsvp_session *s, **sp; + struct rsvp_head *data = rtnl_dereference(tp->root); + struct rsvp_filter *f, *nfp; + struct rsvp_filter __rcu **fp; + struct rsvp_session *nsp, *s; + struct rsvp_session __rcu **sp; struct tc_rsvp_pinfo *pinfo = NULL; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_RSVP_MAX + 1]; @@ -443,15 +481,26 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, f = (struct rsvp_filter *)*arg; if (f) { /* Node exists: adjust only classid */ + struct rsvp_filter *n; if (f->handle != 
handle && handle) goto errout2; + + n = kmemdup(f, sizeof(*f), GFP_KERNEL); + if (!n) { + err = -ENOMEM; + goto errout2; + } + + tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (tb[TCA_RSVP_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - tcf_bind_filter(tp, &f->res, base); + n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); + tcf_bind_filter(tp, &n->res, base); } - tcf_exts_change(tp, &f->exts, &e); + tcf_exts_change(tp, &n->exts, &e); + rsvp_replace(tp, n, handle); return 0; } @@ -499,7 +548,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout; } - for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { + for (sp = &data->ht[h1]; + (s = rtnl_dereference(*sp)) != NULL; + sp = &s->next) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && pinfo && pinfo->protocol == s->protocol && memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && @@ -521,12 +572,16 @@ insert: tcf_exts_change(tp, &f->exts, &e); - for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) - if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) + fp = &s->ht[h2]; + for (nfp = rtnl_dereference(*fp); nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) { + __u32 mask = nfp->spi.mask & f->spi.mask; + + if (mask != f->spi.mask) break; - f->next = *fp; - wmb(); - *fp = f; + } + RCU_INIT_POINTER(f->next, nfp); + rcu_assign_pointer(*fp, f); *arg = (unsigned long)f; return 0; @@ -546,26 +601,27 @@ insert: s->protocol = pinfo->protocol; s->tunnelid = pinfo->tunnelid; } - for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { - if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + sp = &data->ht[h1]; + for (nsp = rtnl_dereference(*sp); nsp; + sp = &nsp->next, nsp = rtnl_dereference(*sp)) { + if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask) break; } - s->next = *sp; - wmb(); - *sp = s; + RCU_INIT_POINTER(s->next, nsp); + rcu_assign_pointer(*sp, s); goto insert; errout: kfree(f); errout2: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } static 
void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct rsvp_head *head = tp->root; + struct rsvp_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (arg->stop) @@ -574,11 +630,13 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) for (h = 0; h < 256; h++) { struct rsvp_session *s; - for (s = head->ht[h]; s; s = s->next) { + for (s = rtnl_dereference(head->ht[h]); s; + s = rtnl_dereference(s->next)) { for (h1 = 0; h1 <= 16; h1++) { struct rsvp_filter *f; - for (f = s->ht[h1]; f; f = f->next) { + for (f = rtnl_dereference(s->ht[h1]); f; + f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index c721cd4a469f..30f10fb07f4a 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -32,19 +32,21 @@ struct tcindex_filter_result { struct tcindex_filter { u16 key; struct tcindex_filter_result result; - struct tcindex_filter *next; + struct tcindex_filter __rcu *next; + struct rcu_head rcu; }; struct tcindex_data { struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter **h; /* imperfect hash; only used if !perfect; - NULL if unused */ + struct tcindex_filter __rcu **h; /* imperfect hash; */ + struct tcf_proto *tp; u16 mask; /* AND key with mask */ - int shift; /* shift ANDed key to the right */ - int hash; /* hash table size; 0 if undefined */ - int alloc_hash; /* allocated size */ - int fall_through; /* 0: only classify if explicit match */ + u32 shift; /* shift ANDed key to the right */ + u32 hash; /* hash table size; 0 if undefined */ + u32 alloc_hash; /* allocated size */ + u32 fall_through; /* 0: only classify if explicit match */ + struct rcu_head rcu; }; static inline int @@ -56,13 +58,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r) static struct tcindex_filter_result * tcindex_lookup(struct tcindex_data *p, u16 key) { - struct tcindex_filter *f; + if 
(p->perfect) { + struct tcindex_filter_result *f = p->perfect + key; + + return tcindex_filter_is_set(f) ? f : NULL; + } else if (p->h) { + struct tcindex_filter __rcu **fp; + struct tcindex_filter *f; - if (p->perfect) - return tcindex_filter_is_set(p->perfect + key) ? - p->perfect + key : NULL; - else if (p->h) { - for (f = p->h[key % p->hash]; f; f = f->next) + fp = &p->h[key % p->hash]; + for (f = rcu_dereference_bh_rtnl(*fp); + f; + fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) if (f->key == key) return &f->result; } @@ -74,7 +81,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key) static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rcu_dereference_bh(tp->root); struct tcindex_filter_result *f; int key = (skb->tc_index & p->mask) >> p->shift; @@ -99,7 +106,7 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -129,49 +136,59 @@ static int tcindex_init(struct tcf_proto *tp) p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; - tp->root = p; + rcu_assign_pointer(tp->root, p); return 0; } - static int -__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +tcindex_delete(struct tcf_proto *tp, unsigned long arg) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter __rcu **walk; struct tcindex_filter *f = NULL; - pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f); + pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p); if (p->perfect) { if (!r->res.class) return -ENOENT; } else { int i; - 
struct tcindex_filter **walk = NULL; - for (i = 0; i < p->hash; i++) - for (walk = p->h+i; *walk; walk = &(*walk)->next) - if (&(*walk)->result == r) + for (i = 0; i < p->hash; i++) { + walk = p->h + i; + for (f = rtnl_dereference(*walk); f; + walk = &f->next, f = rtnl_dereference(*walk)) { + if (&f->result == r) goto found; + } + } return -ENOENT; found: - f = *walk; - if (lock) - tcf_tree_lock(tp); - *walk = f->next; - if (lock) - tcf_tree_unlock(tp); + rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(tp, &r->exts); - kfree(f); + tcf_exts_destroy(&r->exts); + if (f) + kfree_rcu(f, rcu); return 0; } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, + struct tcf_walker *walker) { - return __tcindex_delete(tp, arg, 1); + return tcindex_delete(tp, arg); +} + +static void __tcindex_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p->h); + kfree(p); } static inline int @@ -194,6 +211,14 @@ static void tcindex_filter_result_init(struct tcindex_filter_result *r) tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } +static void __tcindex_partial_destroy(struct rcu_head *head) +{ + struct tcindex_data *p = container_of(head, struct tcindex_data, rcu); + + kfree(p->perfect); + kfree(p); +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, @@ -203,7 +228,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data cp; + struct tcindex_data *cp, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; @@ -212,89 +237,130 @@ tcindex_set_parms(struct net *net, struct 
tcf_proto *tp, unsigned long base, if (err < 0) return err; - memcpy(&cp, p, sizeof(cp)); - tcindex_filter_result_init(&new_filter_result); + err = -ENOMEM; + /* tcindex_data attributes must look atomic to classifier/lookup so + * allocate new tcindex data and RCU assign it onto root. Keeping + * perfect hash and hash pointers from old data. + */ + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + goto errout; + + cp->mask = p->mask; + cp->shift = p->shift; + cp->hash = p->hash; + cp->alloc_hash = p->alloc_hash; + cp->fall_through = p->fall_through; + cp->tp = tp; + if (p->perfect) { + int i; + + cp->perfect = kmemdup(p->perfect, + sizeof(*r) * cp->hash, GFP_KERNEL); + if (!cp->perfect) + goto errout; + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + balloc = 1; + } + cp->h = p->h; + + tcindex_filter_result_init(&new_filter_result); tcindex_filter_result_init(&cr); if (old_r) cr.res = r->res; if (tb[TCA_TCINDEX_HASH]) - cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); if (tb[TCA_TCINDEX_MASK]) - cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); if (tb[TCA_TCINDEX_SHIFT]) - cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); err = -EBUSY; + /* Hash already allocated, make sure that we still meet the * requirements for the allocated hash. 
*/ - if (cp.perfect) { - if (!valid_perfect_hash(&cp) || - cp.hash > cp.alloc_hash) - goto errout; - } else if (cp.h && cp.hash != cp.alloc_hash) - goto errout; + if (cp->perfect) { + if (!valid_perfect_hash(cp) || + cp->hash > cp->alloc_hash) + goto errout_alloc; + } else if (cp->h && cp->hash != cp->alloc_hash) { + goto errout_alloc; + } err = -EINVAL; if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); + cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - if (!cp.hash) { + if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. */ - if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift) + 1; + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) + cp->hash = (cp->mask >> cp->shift) + 1; else - cp.hash = DEFAULT_HASH_SIZE; + cp->hash = DEFAULT_HASH_SIZE; } - if (!cp.perfect && !cp.h) - cp.alloc_hash = cp.hash; + if (!cp->perfect && !cp->h) + cp->alloc_hash = cp->hash; /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) * but then, we'd fail handles that may become valid after some future * mask change. While this is extremely unlikely to ever matter, * the check below is safer (and also more backwards-compatible). 
*/ - if (cp.perfect || valid_perfect_hash(&cp)) - if (handle >= cp.alloc_hash) - goto errout; + if (cp->perfect || valid_perfect_hash(cp)) + if (handle >= cp->alloc_hash) + goto errout_alloc; err = -ENOMEM; - if (!cp.perfect && !cp.h) { - if (valid_perfect_hash(&cp)) { + if (!cp->perfect && !cp->h) { + if (valid_perfect_hash(cp)) { int i; - cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL); - if (!cp.perfect) - goto errout; - for (i = 0; i < cp.hash; i++) - tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, + cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); + if (!cp->perfect) + goto errout_alloc; + for (i = 0; i < cp->hash; i++) + tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); balloc = 1; } else { - cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); - if (!cp.h) - goto errout; + struct tcindex_filter __rcu **hash; + + hash = kcalloc(cp->hash, + sizeof(struct tcindex_filter *), + GFP_KERNEL); + + if (!hash) + goto errout_alloc; + + cp->h = hash; balloc = 2; } } - if (cp.perfect) - r = cp.perfect + handle; + if (cp->perfect) + r = cp->perfect + handle; else - r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + r = tcindex_lookup(cp, handle) ? 
: &new_filter_result; if (r == &new_filter_result) { f = kzalloc(sizeof(*f), GFP_KERNEL); if (!f) goto errout_alloc; + f->key = handle; + tcindex_filter_result_init(&f->result); + f->next = NULL; } if (tb[TCA_TCINDEX_CLASSID]) { @@ -307,34 +373,40 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - tcf_tree_lock(tp); if (old_r && old_r != r) tcindex_filter_result_init(old_r); - memcpy(p, &cp, sizeof(cp)); + oldp = p; r->res = cr.res; + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { - struct tcindex_filter **fp; + struct tcindex_filter *nfp; + struct tcindex_filter __rcu **fp; - f->key = handle; - f->result = new_filter_result; - f->next = NULL; - for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) - /* nothing */; - *fp = f; + tcf_exts_change(tp, &f->result.exts, &r->exts); + + fp = cp->h + (handle % cp->hash); + for (nfp = rtnl_dereference(*fp); + nfp; + fp = &nfp->next, nfp = rtnl_dereference(*fp)) + ; /* nothing */ + + rcu_assign_pointer(*fp, f); } - tcf_tree_unlock(tp); + if (oldp) + call_rcu(&oldp->rcu, __tcindex_partial_destroy); return 0; errout_alloc: if (balloc == 1) - kfree(cp.perfect); + kfree(cp->perfect); else if (balloc == 2) - kfree(cp.h); + kfree(cp->h); errout: - tcf_exts_destroy(tp, &e); + kfree(cp); + tcf_exts_destroy(&e); return err; } @@ -345,7 +417,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; int err; @@ -364,10 +436,9 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr); } - static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter *f, 
*next; int i; @@ -390,8 +461,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) if (!p->h) return; for (i = 0; i < p->hash; i++) { - for (f = p->h[i]; f; f = next) { - next = f->next; + for (f = rtnl_dereference(p->h[i]); f; f = next) { + next = rtnl_dereference(f->next); if (walker->count >= walker->skip) { if (walker->fn(tp, (unsigned long) &f->result, walker) < 0) { @@ -404,35 +475,26 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } - -static int tcindex_destroy_element(struct tcf_proto *tp, - unsigned long arg, struct tcf_walker *walker) -{ - return __tcindex_delete(tp, arg, 0); -} - - static void tcindex_destroy(struct tcf_proto *tp) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; - walker.fn = &tcindex_destroy_element; + walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - kfree(p->perfect); - kfree(p->h); - kfree(p); - tp->root = NULL; + + RCU_INIT_POINTER(tp->root, NULL); + call_rcu(&p->rcu, __tcindex_destroy); } static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tcindex_data *p = tp->root; + struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -455,15 +517,18 @@ static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, nla_nest_end(skb, nest); } else { if (p->perfect) { - t->tcm_handle = r-p->perfect; + t->tcm_handle = r - p->perfect; } else { struct tcindex_filter *f; + struct tcindex_filter __rcu **fp; int i; t->tcm_handle = 0; for (i = 0; !t->tcm_handle && i < p->hash; i++) { - for (f = p->h[i]; !t->tcm_handle && f; - f = f->next) { + fp = &p->h[i]; + for (f = rtnl_dereference(*fp); + 
!t->tcm_handle && f; + fp = &f->next, f = rtnl_dereference(*fp)) { if (&f->result == r) t->tcm_handle = f->key; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 70c0be8d0121..0472909bb014 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -36,6 +36,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/percpu.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/bitmap.h> @@ -44,40 +45,49 @@ #include <net/pkt_cls.h> struct tc_u_knode { - struct tc_u_knode *next; + struct tc_u_knode __rcu *next; u32 handle; - struct tc_u_hnode *ht_up; + struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; #ifdef CONFIG_NET_CLS_IND int ifindex; #endif u8 fshift; struct tcf_result res; - struct tc_u_hnode *ht_down; + struct tc_u_hnode __rcu *ht_down; #ifdef CONFIG_CLS_U32_PERF - struct tc_u32_pcnt *pf; + struct tc_u32_pcnt __percpu *pf; #endif #ifdef CONFIG_CLS_U32_MARK - struct tc_u32_mark mark; + u32 val; + u32 mask; + u32 __percpu *pcpu_success; #endif + struct tcf_proto *tp; + struct rcu_head rcu; + /* The 'sel' field MUST be the last field in structure to allow for + * tc_u32_keys allocated at end of structure. 
+ */ struct tc_u32_sel sel; }; struct tc_u_hnode { - struct tc_u_hnode *next; + struct tc_u_hnode __rcu *next; u32 handle; u32 prio; struct tc_u_common *tp_c; int refcnt; unsigned int divisor; - struct tc_u_knode *ht[1]; + struct tc_u_knode __rcu *ht[1]; + struct rcu_head rcu; }; struct tc_u_common { - struct tc_u_hnode *hlist; + struct tc_u_hnode __rcu *hlist; struct Qdisc *q; int refcnt; u32 hgenerator; + struct rcu_head rcu; }; static inline unsigned int u32_hash_fold(__be32 key, @@ -96,7 +106,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = tp->root; + struct tc_u_hnode *ht = rcu_dereference_bh(tp->root); unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; @@ -108,23 +118,23 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct int i, r; next_ht: - n = ht->ht[sel]; + n = rcu_dereference_bh(ht->ht[sel]); next_knode: if (n) { struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF - n->pf->rcnt += 1; + __this_cpu_inc(n->pf->rcnt); j = 0; #endif #ifdef CONFIG_CLS_U32_MARK - if ((skb->mark & n->mark.mask) != n->mark.val) { - n = n->next; + if ((skb->mark & n->mask) != n->val) { + n = rcu_dereference_bh(n->next); goto next_knode; } else { - n->mark.success++; + __this_cpu_inc(*n->pcpu_success); } #endif @@ -139,37 +149,39 @@ next_knode: if (!data) goto out; if ((*data ^ key->val) & key->mask) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF - n->pf->kcnts[j] += 1; + __this_cpu_inc(n->pf->kcnts[j]); j++; #endif } - if (n->ht_down == NULL) { + + ht = rcu_dereference_bh(n->ht_down); + if (!ht) { check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; #ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, n->ifindex)) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } #endif #ifdef CONFIG_CLS_U32_PERF - n->pf->rhit += 1; 
+ __this_cpu_inc(n->pf->rhit); #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } return r; } - n = n->next; + n = rcu_dereference_bh(n->next); goto next_knode; } @@ -180,7 +192,7 @@ check_terminal: stack[sdepth].off = off; sdepth++; - ht = n->ht_down; + ht = rcu_dereference_bh(n->ht_down); sel = 0; if (ht->divisor) { __be32 *data, hdata; @@ -222,7 +234,7 @@ check_terminal: /* POP */ if (sdepth--) { n = stack[sdepth].knode; - ht = n->ht_up; + ht = rcu_dereference_bh(n->ht_up); off = stack[sdepth].off; goto check_terminal; } @@ -239,7 +251,9 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; - for (ht = tp_c->hlist; ht; ht = ht->next) + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) if (ht->handle == handle) break; @@ -256,7 +270,9 @@ u32_lookup_key(struct tc_u_hnode *ht, u32 handle) if (sel > ht->divisor) goto out; - for (n = ht->ht[sel]; n; n = n->next) + for (n = rtnl_dereference(ht->ht[sel]); + n; + n = rtnl_dereference(n->next)) if (n->handle == handle) break; out: @@ -270,7 +286,7 @@ static unsigned long u32_get(struct tcf_proto *tp, u32 handle) struct tc_u_common *tp_c = tp->data; if (TC_U32_HTID(handle) == TC_U32_ROOT) - ht = tp->root; + ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); @@ -291,6 +307,9 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) { int i = 0x800; + /* hgenerator only used inside rtnl lock it is safe to increment + * without read _copy_ update semantics + */ do { if (++tp_c->hgenerator == 0x7FF) tp_c->hgenerator = 1; @@ -326,41 +345,78 @@ static int u32_init(struct tcf_proto *tp) } tp_c->refcnt++; - root_ht->next = tp_c->hlist; - tp_c->hlist = root_ht; + RCU_INIT_POINTER(root_ht->next, tp_c->hlist); + rcu_assign_pointer(tp_c->hlist, root_ht); root_ht->tp_c = tp_c; - tp->root = root_ht; + rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } 
-static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +static int u32_destroy_key(struct tcf_proto *tp, + struct tc_u_knode *n, + bool free_pf) { - tcf_unbind_filter(tp, &n->res); - tcf_exts_destroy(tp, &n->exts); + tcf_exts_destroy(&n->exts); if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); + if (free_pf) + free_percpu(n->pf); +#endif +#ifdef CONFIG_CLS_U32_MARK + if (free_pf) + free_percpu(n->pcpu_success); #endif kfree(n); return 0; } +/* u32_delete_key_rcu should be called when free'ing a copied + * version of a tc_u_knode obtained from u32_init_knode(). When + * copies are obtained from u32_init_knode() the statistics are + * shared between the old and new copies to allow readers to + * continue to update the statistics during the copy. To support + * this the u32_delete_key_rcu variant does not free the percpu + * statistics. + */ +static void u32_delete_key_rcu(struct rcu_head *rcu) +{ + struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); + + u32_destroy_key(key->tp, key, false); +} + +/* u32_delete_key_freepf_rcu is the rcu callback variant + * that free's the entire structure including the statistics + * percpu variables. Only use this if the key is not a copy + * returned by u32_init_knode(). 
See u32_delete_key_rcu() + * for the variant that should be used with keys return from + * u32_init_knode() + */ +static void u32_delete_key_freepf_rcu(struct rcu_head *rcu) +{ + struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu); + + u32_destroy_key(key->tp, key, true); +} + static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { - struct tc_u_knode **kp; - struct tc_u_hnode *ht = key->ht_up; + struct tc_u_knode __rcu **kp; + struct tc_u_knode *pkp; + struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); if (ht) { - for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { - if (*kp == key) { - tcf_tree_lock(tp); - *kp = key->next; - tcf_tree_unlock(tp); - - u32_destroy_key(tp, key); + kp = &ht->ht[TC_U32_HASH(key->handle)]; + for (pkp = rtnl_dereference(*kp); pkp; + kp = &pkp->next, pkp = rtnl_dereference(*kp)) { + if (pkp == key) { + RCU_INIT_POINTER(*kp, key->next); + + tcf_unbind_filter(tp, &key->res); + call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; } } @@ -375,10 +431,11 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) unsigned int h; for (h = 0; h <= ht->divisor; h++) { - while ((n = ht->ht[h]) != NULL) { - ht->ht[h] = n->next; - - u32_destroy_key(tp, n); + while ((n = rtnl_dereference(ht->ht[h])) != NULL) { + RCU_INIT_POINTER(ht->ht[h], + rtnl_dereference(n->next)); + tcf_unbind_filter(tp, &n->res); + call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } } @@ -386,28 +443,31 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode **hn; + struct tc_u_hnode __rcu **hn; + struct tc_u_hnode *phn; WARN_ON(ht->refcnt); u32_clear_hnode(tp, ht); - for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { - if (*hn == ht) { - *hn = ht->next; - kfree(ht); + hn = &tp_c->hlist; + for (phn = rtnl_dereference(*hn); + phn; + hn = &phn->next, phn = 
rtnl_dereference(*hn)) { + if (phn == ht) { + RCU_INIT_POINTER(*hn, ht->next); + kfree_rcu(ht, rcu); return 0; } } - WARN_ON(1); return -ENOENT; } static void u32_destroy(struct tcf_proto *tp) { struct tc_u_common *tp_c = tp->data; - struct tc_u_hnode *root_ht = tp->root; + struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); @@ -419,17 +479,16 @@ static void u32_destroy(struct tcf_proto *tp) tp->q->u32_node = NULL; - for (ht = tp_c->hlist; ht; ht = ht->next) { + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { ht->refcnt--; u32_clear_hnode(tp, ht); } - while ((ht = tp_c->hlist) != NULL) { - tp_c->hlist = ht->next; - - WARN_ON(ht->refcnt != 0); - - kfree(ht); + while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { + RCU_INIT_POINTER(tp_c->hlist, ht->next); + kfree_rcu(ht, rcu); } kfree(tp_c); @@ -441,6 +500,7 @@ static void u32_destroy(struct tcf_proto *tp) static int u32_delete(struct tcf_proto *tp, unsigned long arg) { struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; + struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); if (ht == NULL) return 0; @@ -448,7 +508,7 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) if (TC_U32_KEY(ht->handle)) return u32_delete_key(tp, (struct tc_u_knode *)ht); - if (tp->root == ht) + if (root_ht == ht) return -EINVAL; if (ht->refcnt == 1) { @@ -471,7 +531,9 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) if (!bitmap) return handle | 0xFFF; - for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); + n; + n = rtnl_dereference(n->next)) set_bit(TC_U32_NODE(n->handle), bitmap); i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); @@ -521,10 +583,8 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, ht_down->refcnt++; } - tcf_tree_lock(tp); - ht_old = n->ht_down; - n->ht_down = ht_down; - tcf_tree_unlock(tp); + ht_old = rtnl_dereference(n->ht_down); + 
rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) ht_old->refcnt--; @@ -547,10 +607,86 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, return 0; errout: - tcf_exts_destroy(tp, &e); + tcf_exts_destroy(&e); return err; } +static void u32_replace_knode(struct tcf_proto *tp, + struct tc_u_common *tp_c, + struct tc_u_knode *n) +{ + struct tc_u_knode __rcu **ins; + struct tc_u_knode *pins; + struct tc_u_hnode *ht; + + if (TC_U32_HTID(n->handle) == TC_U32_ROOT) + ht = rtnl_dereference(tp->root); + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle)); + + ins = &ht->ht[TC_U32_HASH(n->handle)]; + + /* The node must always exist for it to be replaced if this is not the + * case then something went very wrong elsewhere. + */ + for (pins = rtnl_dereference(*ins); ; + ins = &pins->next, pins = rtnl_dereference(*ins)) + if (pins->handle == n->handle) + break; + + RCU_INIT_POINTER(n->next, pins->next); + rcu_assign_pointer(*ins, n); +} + +static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, + struct tc_u_knode *n) +{ + struct tc_u_knode *new; + struct tc_u32_sel *s = &n->sel; + + new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), + GFP_KERNEL); + + if (!new) + return NULL; + + RCU_INIT_POINTER(new->next, n->next); + new->handle = n->handle; + RCU_INIT_POINTER(new->ht_up, n->ht_up); + +#ifdef CONFIG_NET_CLS_IND + new->ifindex = n->ifindex; +#endif + new->fshift = n->fshift; + new->res = n->res; + RCU_INIT_POINTER(new->ht_down, n->ht_down); + + /* bump reference count as long as we hold pointer to structure */ + if (new->ht_down) + new->ht_down->refcnt++; + +#ifdef CONFIG_CLS_U32_PERF + /* Statistics may be incremented by readers during update + * so we must keep them in tact. When the node is later destroyed + * a special destroy call must be made to not free the pf memory. 
+ */ + new->pf = n->pf; +#endif + +#ifdef CONFIG_CLS_U32_MARK + new->val = n->val; + new->mask = n->mask; + /* Similarly success statistics must be moved as pointers */ + new->pcpu_success = n->pcpu_success; +#endif + new->tp = tp; + memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + + tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); + + return new; +} + static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -564,6 +700,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid; int err; +#ifdef CONFIG_CLS_U32_PERF + size_t size; +#endif if (opt == NULL) return handle ? -EINVAL : 0; @@ -574,11 +713,28 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n = (struct tc_u_knode *)*arg; if (n) { + struct tc_u_knode *new; + if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(net, tp, base, n->ht_up, n, tb, - tca[TCA_RATE], ovr); + new = u32_init_knode(tp, n); + if (!new) + return -ENOMEM; + + err = u32_set_parms(net, tp, base, + rtnl_dereference(n->ht_up), new, tb, + tca[TCA_RATE], ovr); + + if (err) { + u32_destroy_key(tp, new, false); + return err; + } + + u32_replace_knode(tp, tp_c, new); + tcf_unbind_filter(tp, &n->res); + call_rcu(&n->rcu, u32_delete_key_rcu); + return 0; } if (tb[TCA_U32_DIVISOR]) { @@ -601,8 +757,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; - ht->next = tp_c->hlist; - tp_c->hlist = ht; + RCU_INIT_POINTER(ht->next, tp_c->hlist); + rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; return 0; } @@ -610,7 +766,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_HASH]) { htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { - ht = tp->root; + ht = rtnl_dereference(tp->root); htid = ht->handle; } else 
{ ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); @@ -618,7 +774,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } } else { - ht = tp->root; + ht = rtnl_dereference(tp->root); htid = ht->handle; } @@ -642,46 +798,62 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -ENOBUFS; #ifdef CONFIG_CLS_U32_PERF - n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); - if (n->pf == NULL) { + size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); + n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); + if (!n->pf) { kfree(n); return -ENOBUFS; } #endif memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - n->ht_up = ht; + RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + n->tp = tp; #ifdef CONFIG_CLS_U32_MARK + n->pcpu_success = alloc_percpu(u32); + if (!n->pcpu_success) { + err = -ENOMEM; + goto errout; + } + if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; mark = nla_data(tb[TCA_U32_MARK]); - memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); - n->mark.success = 0; + n->val = mark->val; + n->mask = mark->mask; } #endif err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr); if (err == 0) { - struct tc_u_knode **ins; - for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) - if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + struct tc_u_knode __rcu **ins; + struct tc_u_knode *pins; + + ins = &ht->ht[TC_U32_HASH(handle)]; + for (pins = rtnl_dereference(*ins); pins; + ins = &pins->next, pins = rtnl_dereference(*ins)) + if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle)) break; - n->next = *ins; - tcf_tree_lock(tp); - *ins = n; - tcf_tree_unlock(tp); + RCU_INIT_POINTER(n->next, pins); + rcu_assign_pointer(*ins, n); *arg = (unsigned long)n; return 0; } + +#ifdef CONFIG_CLS_U32_MARK + free_percpu(n->pcpu_success); +errout: 
+#endif + #ifdef CONFIG_CLS_U32_PERF - kfree(n->pf); + free_percpu(n->pf); #endif kfree(n); return err; @@ -697,7 +869,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (arg->stop) return; - for (ht = tp_c->hlist; ht; ht = ht->next) { + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; if (arg->count >= arg->skip) { @@ -708,7 +882,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } arg->count++; for (h = 0; h <= ht->divisor; h++) { - for (n = ht->ht[h]; n; n = n->next) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { if (arg->count < arg->skip) { arg->count++; continue; @@ -727,6 +903,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct tc_u_hnode *ht_up, *ht_down; struct nlattr *nest; if (n == NULL) @@ -745,11 +922,18 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) goto nla_put_failure; } else { +#ifdef CONFIG_CLS_U32_PERF + struct tc_u32_pcnt *gpf; + int cpu; +#endif + if (nla_put(skb, TCA_U32_SEL, sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), &n->sel)) goto nla_put_failure; - if (n->ht_up) { + + ht_up = rtnl_dereference(n->ht_up); + if (ht_up) { u32 htid = n->handle & 0xFFFFF000; if (nla_put_u32(skb, TCA_U32_HASH, htid)) goto nla_put_failure; @@ -757,14 +941,28 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (n->res.classid && nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) goto nla_put_failure; - if (n->ht_down && - nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + + ht_down = rtnl_dereference(n->ht_down); + if (ht_down && + nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if ((n->mark.val || n->mark.mask) && 
- nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) - goto nla_put_failure; + if ((n->val || n->mask)) { + struct tc_u32_mark mark = {.val = n->val, + .mask = n->mask, + .success = 0}; + int cpum; + + for_each_possible_cpu(cpum) { + __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum); + + mark.success += cnt; + } + + if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark)) + goto nla_put_failure; + } #endif if (tcf_exts_dump(skb, &n->exts) < 0) @@ -779,10 +977,29 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, } #endif #ifdef CONFIG_CLS_U32_PERF + gpf = kzalloc(sizeof(struct tc_u32_pcnt) + + n->sel.nkeys * sizeof(u64), + GFP_KERNEL); + if (!gpf) + goto nla_put_failure; + + for_each_possible_cpu(cpu) { + int i; + struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu); + + gpf->rcnt += pf->rcnt; + gpf->rhit += pf->rhit; + for (i = 0; i < n->sel.nkeys; i++) + gpf->kcnts[i] += pf->kcnts[i]; + } + if (nla_put(skb, TCA_U32_PCNT, sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf)) + gpf)) { + kfree(gpf); goto nla_put_failure; + } + kfree(gpf); #endif } diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index bfd34e4c1afc..ddd883ca55b2 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -120,12 +120,11 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, return match; } -static int em_canid_change(struct tcf_proto *tp, void *data, int len, +static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; - struct canid_match *cm_old = (struct canid_match *)m->data; int i; if (!len) @@ -181,16 +180,10 @@ static int em_canid_change(struct tcf_proto *tp, void *data, int len, m->datalen = sizeof(struct canid_match) + len; m->data = (unsigned long)cm; - - if (cm_old != NULL) { - pr_err("canid: Configuring an existing ematch!\n"); - kfree(cm_old); - } - return 0; } -static void em_canid_destroy(struct 
tcf_proto *tp, struct tcf_ematch *m) +static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index 527aeb7a3ff0..5b4a4efe468c 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -19,12 +19,11 @@ #include <net/ip.h> #include <net/pkt_cls.h> -static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, +static int em_ipset_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct xt_set_info *set = data; ip_set_id_t index; - struct net *net = dev_net(qdisc_dev(tp->q)); if (data_len != sizeof(*set)) return -EINVAL; @@ -42,11 +41,11 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, return -ENOMEM; } -static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +static void em_ipset_destroy(struct tcf_ematch *em) { const struct xt_set_info *set = (const void *) em->data; if (set) { - ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); + ip_set_nfnl_put(em->net, set->index); kfree((void *) em->data); } } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 9b8c0b0e60d7..c8f8c399b99a 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -856,7 +856,7 @@ static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; -static int em_meta_change(struct tcf_proto *tp, void *data, int len, +static int em_meta_change(struct net *net, void *data, int len, struct tcf_ematch *m) { int err; @@ -908,7 +908,7 @@ errout: return err; } -static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_meta_destroy(struct tcf_ematch *m) { if (m) meta_delete((struct meta_match *) m->data); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index a3bed07a008b..df3110d69585 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -23,7 +23,7 @@ struct nbyte_data { char pattern[0]; }; 
-static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, +static int em_nbyte_change(struct net *net, void *data, int data_len, struct tcf_ematch *em) { struct tcf_em_nbyte *nbyte = data; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 15d353d2e4be..f03c3de16c27 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -45,7 +45,7 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; } -static int em_text_change(struct tcf_proto *tp, void *data, int len, +static int em_text_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct text_match *tm; @@ -100,7 +100,7 @@ retry: return 0; } -static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +static void em_text_destroy(struct tcf_ematch *m) { if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) textsearch_destroy(EM_TEXT_PRIV(m)->config); diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 3a633debb6df..6742200b1307 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,6 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); + struct net *net = dev_net(qdisc_dev(tp->q)); if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; @@ -240,7 +241,7 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; if (em->ops->change) { - err = em->ops->change(tp, data, data_len, em); + err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { @@ -271,6 +272,7 @@ static int tcf_em_validate(struct tcf_proto *tp, em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->datalen = data_len; + em->net = net; err = 0; errout: @@ -378,7 +380,7 @@ errout: return err; errout_abort: - tcf_em_tree_destroy(tp, tree); + tcf_em_tree_destroy(tree); return err; } 
EXPORT_SYMBOL(tcf_em_tree_validate); @@ -393,7 +395,7 @@ EXPORT_SYMBOL(tcf_em_tree_validate); * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ -void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; @@ -405,7 +407,7 @@ void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) if (em->ops) { if (em->ops->destroy) - em->ops->destroy(tp, em); + em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); @@ -526,9 +528,12 @@ pop_stack: match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); - if (tcf_em_early_end(cur_match, res)) + if (tcf_em_is_inverted(cur_match)) + res = !res; + + if (tcf_em_early_end(cur_match, res)) { goto pop_stack; - else { + } else { match_idx++; goto proceed; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 58bed7599db7..2cf61b3e633c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -578,31 +578,34 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); + rcu_read_lock(); qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); + rcu_read_unlock(); return HRTIMER_NORESTART; } void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - qdisc_throttled(wd->qdisc); + if 
(throttle) + qdisc_throttled(wd->qdisc); hrtimer_start(&wd->timer, ns_to_ktime(expires), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); } EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); @@ -763,7 +766,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) cops->put(sch, cl); } sch->q.qlen -= n; - sch->qstats.drops += drops; + __qdisc_qstats_drop(sch, drops); } } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -942,6 +945,17 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + alloc_percpu(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto err_out4; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto err_out4; + } + if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -964,8 +978,11 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, else root_lock = qdisc_lock(sch); - err = gen_new_estimator(&sch->bstats, &sch->rate_est, - root_lock, tca[TCA_RATE]); + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + root_lock, + tca[TCA_RATE]); if (err) goto err_out4; } @@ -984,6 +1001,8 @@ err_out: return NULL; err_out4: + free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. @@ -1022,9 +1041,11 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) because change can't be undone. 
*/ if (sch->flags & TCQ_F_MQROOT) goto out; - gen_replace_estimator(&sch->bstats, &sch->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + gen_replace_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); } out: return 0; @@ -1299,11 +1320,14 @@ graft: static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; + __u32 qlen; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); @@ -1321,7 +1345,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - q->qstats.qlen = q->q.qlen; + qlen = q->q.qlen; stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) @@ -1334,9 +1358,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; - if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || + if (qdisc_is_percpu_stats(q)) { + cpu_bstats = q->cpu_bstats; + cpu_qstats = q->cpu_qstats; + } + + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 || gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 || - gnet_stats_copy_queue(&d, &q->qstats) < 0) + gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) @@ -1781,7 +1810,7 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, __be16 protocol = skb->protocol; int err; - for (; tp; tp = tp->next) { + for (; tp; tp = rcu_dereference_bh(tp->next)) { if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) 
continue; @@ -1833,15 +1862,15 @@ void tcf_destroy(struct tcf_proto *tp) { tp->ops->destroy(tp); module_put(tp->ops->owner); - kfree(tp); + kfree_rcu(tp, rcu); } -void tcf_destroy_chain(struct tcf_proto **fl) +void tcf_destroy_chain(struct tcf_proto __rcu **fl) { struct tcf_proto *tp; - while ((tp = *fl) != NULL) { - *fl = tp->next; + while ((tp = rtnl_dereference(*fl)) != NULL) { + RCU_INIT_POINTER(*fl, tp->next); tcf_destroy(tp); } } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 8449b337f9e3..e3e2cc5fd068 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -41,7 +41,7 @@ struct atm_flow_data { struct Qdisc *q; /* FIFO, TBF, etc. */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); /* chaining */ @@ -273,7 +273,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, error = -ENOBUFS; goto err_out; } - flow->filter_list = NULL; + RCU_INIT_POINTER(flow->filter_list, NULL); flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; @@ -311,7 +311,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); if (list_empty(&flow->list)) return -EINVAL; - if (flow->filter_list || flow == &p->link) + if (rcu_access_pointer(flow->filter_list) || flow == &p->link) return -EBUSY; /* * Reference count must be 2: one for "keepalive" (set at class @@ -345,7 +345,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct atm_qdisc_data *p = qdisc_priv(sch); struct atm_flow_data *flow = (struct atm_flow_data *)cl; @@ -369,11 +370,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, 
struct Qdisc *sch) flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + struct tcf_proto *fl; + list_for_each_entry(flow, &p->flows, list) { - if (flow->filter_list) { - result = tc_classify_compat(skb, - flow->filter_list, - &res); + fl = rcu_dereference_bh(flow->filter_list); + if (fl) { + result = tc_classify_compat(skb, fl, &res); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; @@ -415,7 +417,7 @@ done: if (ret != NET_XMIT_SUCCESS) { drop: __maybe_unused if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); if (flow) flow->qstats.drops++; } @@ -544,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.filter_list = NULL; + RCU_INIT_POINTER(p->link.filter_list, NULL); p->link.vcc = NULL; p->link.sock = NULL; p->link.classid = sch->handle; @@ -635,10 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - flow->qstats.qlen = flow->q->q.qlen; - - if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || - gnet_stats_copy_queue(d, &flow->qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; return 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ead526467cca..beeb75f80fdb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -133,7 +133,7 @@ struct cbq_class { struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; int refcnt; int filters; @@ -159,7 +159,6 @@ struct cbq_sched_data { struct cbq_class *tx_borrowed; int tx_len; psched_time_t now; /* Cached timestamp */ - psched_time_t now_rt; /* Cached real time */ unsigned int pmask; 
struct hrtimer delay_timer; @@ -222,6 +221,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct cbq_class **defmap; struct cbq_class *cl = NULL; u32 prio = skb->priority; + struct tcf_proto *fl; struct tcf_result res; /* @@ -236,11 +236,12 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int result = 0; defmap = head->defaults; + fl = rcu_dereference_bh(head->filter_list); /* * Step 2+n. Apply classifier. */ - if (!head->filter_list || - (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) + result = tc_classify_compat(skb, fl, &res); + if (!fl || result < 0) goto fallback; cl = (void *)res.class; @@ -353,12 +354,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) int toplevel = q->toplevel; if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { - psched_time_t now; - psched_tdiff_t incr; - - now = psched_get_time(); - incr = now - q->now_rt; - now = q->now + incr; + psched_time_t now = psched_get_time(); do { if (cl->undertime < now) { @@ -381,7 +377,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) #endif if (cl == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -399,7 +395,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cbq_mark_toplevel(q, cl); cl->qstats.drops++; } @@ -621,7 +617,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) time = ktime_set(0, 0); time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); - hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); + hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED); } qdisc_unthrottled(sch); @@ -654,11 +650,11 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) return 0; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return 0; } - sch->qstats.drops++; + qdisc_qstats_drop(sch); return 
-1; } #endif @@ -700,8 +696,13 @@ cbq_update(struct cbq_sched_data *q) struct cbq_class *this = q->tx_class; struct cbq_class *cl = this; int len = q->tx_len; + psched_time_t now; q->tx_class = NULL; + /* Time integrator. We calculate EOS time + * by adding expected packet transmission time. + */ + now = q->now + L2T(&q->link, len); for ( ; cl; cl = cl->share) { long avgidle = cl->avgidle; @@ -717,7 +718,7 @@ cbq_update(struct cbq_sched_data *q) * idle = (now - last) - last_pktlen/rate */ - idle = q->now - cl->last; + idle = now - cl->last; if ((unsigned long)idle > 128*1024*1024) { avgidle = cl->maxidle; } else { @@ -761,7 +762,7 @@ cbq_update(struct cbq_sched_data *q) idle -= L2T(&q->link, len); idle += L2T(cl, len); - cl->undertime = q->now + idle; + cl->undertime = now + idle; } else { /* Underlimit */ @@ -771,7 +772,8 @@ cbq_update(struct cbq_sched_data *q) else cl->avgidle = avgidle; } - cl->last = q->now; + if ((s64)(now - cl->last) > 0) + cl->last = now; } cbq_update_toplevel(q, this, q->tx_borrowed); @@ -943,31 +945,13 @@ cbq_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct cbq_sched_data *q = qdisc_priv(sch); psched_time_t now; - psched_tdiff_t incr; now = psched_get_time(); - incr = now - q->now_rt; - - if (q->tx_class) { - psched_tdiff_t incr2; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. 
- * If real time is greater, we warp artificial clock, - * so that: - * - * cbq_time = max(real_time, work); - */ - incr2 = L2T(&q->link, q->tx_len); - q->now += incr2; + + if (q->tx_class) cbq_update(q); - if ((incr -= incr2) < 0) - incr = 0; - q->now += incr; - } else { - if (now > q->now) - q->now = now; - } - q->now_rt = now; + + q->now = now; for (;;) { q->wd_expires = 0; @@ -1011,7 +995,7 @@ cbq_dequeue(struct Qdisc *sch) */ if (sch->q.qlen) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (q->wd_expires) qdisc_watchdog_schedule(&q->watchdog, now + q->wd_expires); @@ -1223,7 +1207,6 @@ cbq_reset(struct Qdisc *sch) hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; @@ -1403,11 +1386,10 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.minidle = -0x7FFFFFFF; qdisc_watchdog_init(&q->watchdog, sch); - hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; cbq_link_class(&q->link); @@ -1612,16 +1594,15 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); @@ -1777,7 
+1758,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t } if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1870,7 +1852,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t goto failure; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1974,7 +1956,8 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) return 0; } -static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ed30e436128b..c009eb9045ce 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -57,7 +57,7 @@ struct choke_sched_data { /* Variables */ struct red_vars vars; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ @@ -127,16 +127,22 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) if (idx == q->tail) choke_zap_tail_holes(q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); qdisc_tree_decrease_qlen(sch, 1); --sch->q.qlen; } +/* private part of skb->cb[] that a qdisc is allowed to use + * is limited to QDISC_CB_PRIV_LEN bytes. + * As a flow key might be too large, we store a part of it only. 
+ */ +#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) + struct choke_skb_cb { u16 classid; u8 keys_valid; - struct flow_keys keys; + u8 keys[QDISC_CB_PRIV_LEN - 3]; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -163,22 +169,26 @@ static u16 choke_get_classid(const struct sk_buff *skb) static bool choke_match_flow(struct sk_buff *skb1, struct sk_buff *skb2) { + struct flow_keys temp; + if (skb1->protocol != skb2->protocol) return false; if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); + skb_flow_dissect(skb1, &temp); + memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); + skb_flow_dissect(skb2, &temp); + memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - sizeof(struct flow_keys)); + CHOKE_K_LEN); } /* @@ -193,9 +203,11 @@ static bool choke_classify(struct sk_buff *skb, { struct choke_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -249,7 +261,7 @@ static bool choke_match_random(const struct choke_sched_data *q, return false; oskb = choke_peek_random(q, pidx); - if (q->filter_list) + if (rcu_access_pointer(q->filter_list)) return choke_get_classid(nskb) == choke_get_classid(oskb); return choke_match_flow(oskb, nskb); @@ -257,11 +269,11 @@ static bool choke_match_random(const struct choke_sched_data *q, static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) { + int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); const struct 
red_parms *p = &q->parms; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - if (q->filter_list) { + if (rcu_access_pointer(q->filter_list)) { /* If using external classifiers, get result and record it. */ if (!choke_classify(skb, sch, &ret)) goto other_drop; /* Packet was eaten by filter */ @@ -290,7 +302,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->vars.qavg > p->qth_max) { q->vars.qcount = -1; - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -303,7 +315,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->vars.qcount = 0; q->vars.qR = red_random(p); - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -320,7 +332,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->tab[q->tail] = skb; q->tail = (q->tail + 1) & q->tab_mask; ++sch->q.qlen; - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); return NET_XMIT_SUCCESS; } @@ -333,7 +345,7 @@ congestion_drop: other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -353,7 +365,7 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch) q->tab[q->head] = NULL; choke_zap_head_holes(q); --sch->q.qlen; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; @@ -448,7 +460,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) ntab[tail++] = skb; continue; } - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; qdisc_drop(skb, sch); } @@ -554,7 +566,8 @@ static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, return 0; } -static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto 
__rcu **choke_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct choke_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 2f9ab17db85a..de28f8e968e8 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -149,7 +149,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7bbbfe112192..338706092c27 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,7 +35,7 @@ struct drr_class { struct drr_sched { struct list_head active; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; }; @@ -88,7 +88,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -116,7 +117,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -184,7 +185,8 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg) drr_destroy_class(sch, cl); } -static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct drr_sched *q = qdisc_priv(sch); @@ -273,17 +275,16 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct 
drr_class *cl = (struct drr_class *)arg; + __u32 qlen = cl->qdisc->q.qlen; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); - if (cl->qdisc->q.qlen) { + if (qlen) xstats.deficit = cl->deficit; - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - } - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); @@ -319,6 +320,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -328,7 +330,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -356,7 +359,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = drr_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -365,7 +368,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 49d6ef338b55..227114f27f94 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -37,7 +37,7 @@ struct dsmark_qdisc_data { struct Qdisc *q; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; 
u8 *mask; /* "owns" the array */ u8 *value; u16 indices; @@ -186,8 +186,8 @@ ignore: } } -static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch, - unsigned long cl) +static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct dsmark_qdisc_data *p = qdisc_priv(sch); return &p->filter_list; @@ -229,7 +229,8 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = TC_H_MIN(skb->priority); else { struct tcf_result res; - int result = tc_classify(skb, p->filter_list, &res); + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); + int result = tc_classify(skb, fl, &res); pr_debug("result %d class 0x%04x\n", result, res.classid); @@ -257,7 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) err = qdisc_enqueue(skb, p->q); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return err; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index e15a9eb29087..2e2398cfc694 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* queue full, remove one skb to fulfill the limit */ __qdisc_queue_drop_head(sch, &sch->q); - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_enqueue_tail(skb, sch); return NET_XMIT_CN; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ba32c2b005d0..cbd7e1fd23b4 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -290,7 +290,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) flow->head = skb->next; skb->next = NULL; flow->qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } return skb; @@ -371,13 +371,12 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) f->qlen++; if (skb_is_retransmit(skb)) q->stat_tcp_retrans++; - sch->qstats.backlog += qdisc_pkt_len(skb); + 
qdisc_qstats_backlog_inc(sch, skb); if (fq_flow_is_detached(f)) { fq_flow_add_tail(&q->new_flows, f); if (time_after(jiffies, f->age + q->flow_refill_delay)) f->credit = max_t(u32, f->credit, q->quantum); q->inactive_flows--; - qdisc_unthrottled(sch); } /* Note: this overwrites f->age */ @@ -385,7 +384,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(f == &q->internal)) { q->stat_internal_packets++; - qdisc_unthrottled(sch); } sch->q.qlen++; @@ -416,7 +414,7 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) static struct sk_buff *fq_dequeue(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; @@ -433,7 +431,8 @@ begin: if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow); + q->time_next_delayed_flow, + false); return NULL; } } @@ -495,7 +494,6 @@ begin: } out: qdisc_bstats_update(sch, skb); - qdisc_unthrottled(sch); return skb; } @@ -787,7 +785,7 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_to_ns(ktime_get()); + u64 now = ktime_get_ns(); struct tc_fq_qd_stats st = { .gc_flows = q->stat_gc_flows, .highprio_packets = q->stat_internal_packets, diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 063b726bf1f8..b9ca32ebc1de 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -52,7 +52,7 @@ struct fq_codel_flow { }; /* please try to keep this structure <= 64 bytes */ struct fq_codel_sched_data { - struct tcf_proto *filter_list; /* optional external classifier */ + struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ u32 *backlogs; /* backlog table [flows_cnt] */ u32 flows_cnt; /* number of 
flows */ @@ -77,13 +77,15 @@ static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src ^ keys.ip_proto, (__force u32)keys.ports, q->perturbation); - return ((u64)hash * q->flows_cnt) >> 32; + + return reciprocal_scale(hash, q->flows_cnt); } static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; struct tcf_result res; int result; @@ -92,11 +94,12 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->flows_cnt) return TC_H_MIN(skb->priority); - if (!q->filter_list) + filter = rcu_dereference(q->filter_list); + if (!filter) return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, filter, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -161,8 +164,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) q->backlogs[idx] -= len; kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); flow->dropped++; return idx; } @@ -177,7 +180,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -187,7 +190,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow = &q->flows[idx]; flow_queue_add(flow, skb); q->backlogs[idx] += qdisc_pkt_len(skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); @@ -495,7 +498,8 @@ static void fq_codel_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto 
**fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -546,7 +550,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e1543b03e39d..6efca30894aa 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - skb_dst_force(skb); q->gso_skb = skb; q->qstats.requeues++; q->q.qlen++; /* it's still part of the queue */ @@ -56,24 +55,56 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } -static inline struct sk_buff *dequeue_skb(struct Qdisc *q) +static void try_bulk_dequeue_skb(struct Qdisc *q, + struct sk_buff *skb, + const struct netdev_queue *txq, + int *packets) +{ + int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; + + while (bytelimit > 0) { + struct sk_buff *nskb = q->dequeue(q); + + if (!nskb) + break; + + bytelimit -= nskb->len; /* covers GSO len */ + skb->next = nskb; + skb = nskb; + (*packets)++; /* GSO counts as one pkt */ + } + skb->next = NULL; +} + +/* Note that dequeue_skb can possibly return a SKB list (via skb->next). + * A requeued skb (via q->gso_skb) can also be a SKB list. 
+ */ +static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, + int *packets) { struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + *packets = 1; + *validate = true; if (unlikely(skb)) { /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else skb = NULL; + /* skb in gso_skb were already validated */ + *validate = false; } else { - if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + if (!(q->flags & TCQ_F_ONETXQUEUE) || + !netif_xmit_frozen_or_stopped(txq)) { skb = q->dequeue(q); + if (skb && qdisc_may_bulk(q)) + try_bulk_dequeue_skb(q, skb, txq, packets); + } } - return skb; } @@ -90,7 +121,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * detect it by checking xmit owner and drop the packet when * deadloop is detected. Return OK to try the next skb. */ - kfree_skb(skb); + kfree_skb_list(skb); net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", dev_queue->dev->name); ret = qdisc_qlen(q); @@ -107,9 +138,9 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, } /* - * Transmit one skb, and handle the return status as required. Holding the - * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this - * function. + * Transmit possibly several skbs, and handle the return status as + * required. Holding the __QDISC___STATE_RUNNING bit guarantees that + * only one CPU can execute this function. * * Returns to the caller: * 0 - queue is empty or throttled. 
@@ -117,19 +148,24 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock) + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; /* And release qdisc */ spin_unlock(root_lock); - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) - ret = dev_hard_start_xmit(skb, dev, txq); + /* Note that we validate skb (GSO, checksum, ...) outside of locks */ + if (validate) + skb = validate_xmit_skb_list(skb, dev); - HARD_TX_UNLOCK(dev, txq); + if (skb) { + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_stopped(txq)) + skb = dev_hard_start_xmit(skb, dev, txq, &ret); + HARD_TX_UNLOCK(dev, txq); + } spin_lock(root_lock); if (dev_xmit_complete(ret)) { @@ -156,7 +192,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * - * __QDISC_STATE_RUNNING guarantees only one CPU can process + * __QDISC___STATE_RUNNING guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * @@ -172,36 +208,39 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. 
* */ -static inline int qdisc_restart(struct Qdisc *q) +static inline int qdisc_restart(struct Qdisc *q, int *packets) { struct netdev_queue *txq; struct net_device *dev; spinlock_t *root_lock; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - skb = dequeue_skb(q); + skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return 0; - WARN_ON_ONCE(skb_dst_is_noref(skb)); + root_lock = qdisc_lock(q); dev = qdisc_dev(q); - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = skb_get_tx_queue(dev, skb); - return sch_direct_xmit(skb, q, dev, txq, root_lock); + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = weight_p; + int packets; - while (qdisc_restart(q)) { + while (qdisc_restart(q, &packets)) { /* * Ordered by possible occurrence: Postpone processing if * 1. we've exceeded packet quota * 2. another process needs the CPU; */ - if (--quota <= 0 || need_resched()) { + quota -= packets; + if (quota <= 0 || need_resched()) { __netif_schedule(q); break; } @@ -518,7 +557,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - skb_queue_head_init(band2list(priv, prio)); + __skb_queue_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -616,7 +655,7 @@ void qdisc_reset(struct Qdisc *qdisc) ops->reset(qdisc); if (qdisc->gso_skb) { - kfree_skb(qdisc->gso_skb); + kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; qdisc->q.qlen = 0; } @@ -627,6 +666,9 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); + if (qdisc_is_percpu_stats(qdisc)) + free_percpu(qdisc->cpu_bstats); + kfree((char *) qdisc - qdisc->padded); } @@ -652,7 +694,7 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - 
kfree_skb(qdisc->gso_skb); + kfree_skb_list(qdisc->gso_skb); /* * gen_estimator est_timer() might access qdisc->q.lock, * wait a RCU grace period before freeing qdisc. @@ -778,7 +820,7 @@ static void dev_deactivate_queue(struct net_device *dev, struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; - qdisc = dev_queue->qdisc; + qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { spin_lock_bh(qdisc_lock(qdisc)); @@ -871,7 +913,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, { struct Qdisc *qdisc = _qdisc; - dev_queue->qdisc = qdisc; + rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 12cbc09157fc..a4ca4517cdc8 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -209,7 +209,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -219,7 +219,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (gred_use_harddrop(t) || !gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ec8aeaac1dd7..e6c7416d0332 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,7 @@ struct hfsc_class { struct gnet_stats_queue qstats; struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ - struct tcf_proto *filter_list; /* filter list */ + struct tcf_proto __rcu *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ struct hfsc_sched *sched; /* scheduler data */ @@ -1014,9 +1014,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cur_time = psched_get_time(); if (tca[TCA_RATE]) { - err = 
gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE]); + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, + tca[TCA_RATE]); if (err) return err; } @@ -1063,7 +1066,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) { @@ -1161,7 +1164,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; - tcf = q->root.filter_list; + tcf = rcu_dereference_bh(q->root.filter_list); while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1185,7 +1188,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return cl; /* hit leaf class */ /* apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); head = cl; } @@ -1285,7 +1288,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) cl->filter_cnt--; } -static struct tcf_proto ** +static struct tcf_proto __rcu ** hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) { struct hfsc_sched *q = qdisc_priv(sch); @@ -1367,16 +1370,15 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct hfsc_class *cl = (struct hfsc_class *)arg; struct tc_hfsc_stats xstats; - cl->qstats.qlen = cl->qdisc->q.qlen; cl->qstats.backlog = cl->qdisc->qstats.backlog; xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, 
NULL, &cl->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); @@ -1588,7 +1590,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1597,7 +1599,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } @@ -1640,7 +1642,7 @@ hfsc_dequeue(struct Qdisc *sch) */ cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); hfsc_schedule_watchdog(sch); return NULL; } @@ -1695,7 +1697,7 @@ hfsc_drop(struct Qdisc *sch) list_move_tail(&cl->dlist, &q->droplist); } cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d85b6812a7d4..15d3aabfe250 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -376,8 +376,8 @@ static unsigned int hhf_drop(struct Qdisc *sch) struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } @@ -395,7 +395,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) bucket = &q->buckets[idx]; bucket_add(bucket, skb); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; @@ -457,7 +457,7 @@ begin: if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9f949abcacef..f1acb0f60dc3 100644 --- 
a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -103,7 +103,7 @@ struct htb_class { u32 prio; /* these two are used only by leaves... */ int quantum; /* but stored for parent-to-leaf return */ - struct tcf_proto *filter_list; /* class attached filters */ + struct tcf_proto __rcu *filter_list; /* class attached filters */ int filter_cnt; int refcnt; /* usage count of this class */ @@ -153,7 +153,7 @@ struct htb_sched { int rate2quantum; /* quant = rate / rate2quantum */ /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; #define HTB_WARN_TOOMANYEVENTS 0x1 unsigned int warned; /* only one warning */ @@ -223,9 +223,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, if (cl->level == 0) return cl; /* Start with inner filter chain if a non-leaf class is selected */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } else { - tcf = q->filter_list; + tcf = rcu_dereference_bh(q->filter_list); } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; @@ -251,7 +251,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, return cl; /* we hit leaf; return it */ /* we have got inner class; apply inner filter chain */ - tcf = cl->filter_list; + tcf = rcu_dereference_bh(cl->filter_list); } /* classification failed; try to use default class */ cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); @@ -586,13 +586,13 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; #endif } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); cl->qstats.drops++; } return ret; @@ -895,7 +895,7 @@ ok: if (!sch->q.qlen) goto fin; - q->now = ktime_to_ns(ktime_get()); + q->now = ktime_get_ns(); start_at = 
jiffies; next_event = q->now + 5LLU * NSEC_PER_SEC; @@ -925,14 +925,14 @@ ok: goto ok; } } - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) { if (!test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { ktime_t time = ns_to_ktime(next_event); qdisc_throttled(q->watchdog.qdisc); hrtimer_start(&q->watchdog.timer, time, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); } } else { schedule_work(&q->work); @@ -1044,7 +1044,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - skb_queue_head_init(&q->direct_queue); + __skb_queue_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); @@ -1138,15 +1138,16 @@ static int htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct htb_class *cl = (struct htb_class *)arg; + __u32 qlen = 0; if (!cl->level && cl->un.leaf.q) - cl->qstats.qlen = cl->un.leaf.q->q.qlen; + qlen = cl->un.leaf.q->q.qlen; cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qstats) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); @@ -1225,7 +1226,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; - parent->t_c = ktime_to_ns(ktime_get()); + parent->t_c = ktime_get_ns(); parent->cmode = HTB_CAN_SEND; } @@ -1402,7 +1403,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, goto failure; if (htb_rate_est || tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE] ? : &est.nla); if (err) { @@ -1455,7 +1457,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->tokens = PSCHED_TICKS2NS(hopt->buffer); cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */ - cl->t_c = ktime_to_ns(ktime_get()); + cl->t_c = ktime_get_ns(); cl->cmode = HTB_CAN_SEND; /* attach to the hash list and parent's family */ @@ -1464,8 +1466,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent->children++; } else { if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), + spinlock_t *lock = qdisc_root_sleeping_lock(sch); + + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + lock, tca[TCA_RATE]); if (err) return err; @@ -1519,11 +1524,12 @@ failure: return err; } -static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch, + unsigned long arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)arg; - struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + struct tcf_proto __rcu **fl = cl ? 
&cl->filter_list : &q->filter_list; return fl; } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 62871c14e1f9..eb5b8445fef9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -17,7 +17,7 @@ struct ingress_qdisc_data { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; }; /* ------------------------- Class/flow operations ------------------------- */ @@ -46,7 +46,8 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) { } -static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct ingress_qdisc_data *p = qdisc_priv(sch); @@ -59,15 +60,16 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct ingress_qdisc_data *p = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); int result; - result = tc_classify(skb, p->filter_list, &res); + result = tc_classify(skb, fl, &res); qdisc_bstats_update(sch, skb); switch (result) { case TC_ACT_SHOT: result = TC_ACT_SHOT; - sch->qstats.drops++; + qdisc_qstats_drop(sch); break; case TC_ACT_STOLEN: case TC_ACT_QUEUED: diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index a8b2864a696b..f3cbaecd283a 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -112,7 +112,6 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -200,9 +199,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, 
&sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6749e2f540d0..3811a745452c 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -231,12 +231,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; @@ -327,6 +326,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl <= netdev_get_num_tc(dev)) { int i; + __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; @@ -340,11 +340,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, spin_unlock_bh(d->lock); for (i = tc.offset; i < tc.offset + tc.count; i++) { - qdisc = netdev_get_tx_queue(dev, i)->qdisc; + struct netdev_queue *q = netdev_get_tx_queue(dev, i); + + qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); + qlen += qdisc->q.qlen; bstats.bytes += qdisc->bstats.bytes; bstats.packets += qdisc->bstats.packets; - qstats.qlen += qdisc->qstats.qlen; qstats.backlog += qdisc->qstats.backlog; qstats.drops += qdisc->qstats.drops; qstats.requeues += qdisc->qstats.requeues; @@ -353,16 +355,16 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, } /* Reclaim root sleeping lock before completing stats 
*/ spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(d, &bstats) < 0 || - gnet_stats_copy_queue(d, &qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - sch->qstats.qlen = sch->q.qlen; - if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, &sch->qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, + &sch->qstats, sch->q.qlen) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index afb050a735fa..42dd218871e0 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -31,7 +31,7 @@ struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc **queues; }; @@ -42,10 +42,11 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; + struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, q->filter_list, &res); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -74,7 +75,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -86,7 +87,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } @@ -359,9 +360,8 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 
0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; @@ -388,7 +388,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct multiq_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 111d70fddaea..b34331967e02 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,12 +429,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) - sch->qstats.drops++; /* mark packet */ + qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -478,7 +478,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) return qdisc_reshape_fail(skb, sch); - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ @@ -549,15 +549,14 @@ static unsigned int netem_drop(struct Qdisc *sch) sch->q.qlen--; skb->next = NULL; skb->prev = NULL; - len = qdisc_pkt_len(skb); - sch->qstats.backlog -= len; + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); } } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return len; } @@ -575,7 +574,7 @@ tfifo_dequeue: skb = __skb_dequeue(&sch->q); if (skb) { deliver: - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_unthrottled(sch); qdisc_bstats_update(sch, 
skb); return skb; @@ -610,7 +609,7 @@ deliver: if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { - sch->qstats.drops++; + qdisc_qstats_drop(sch); qdisc_tree_decrease_qlen(sch, 1); } } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fefeeb73f15f..33d7a98a7a97 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -232,7 +232,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __skb_dequeue(&sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch); } qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 79359b69ad8d..8e5cd34aaa74 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -24,7 +24,7 @@ struct prio_sched_data { int bands; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; @@ -36,11 +36,13 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; + struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { - err = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + err = tc_classify(skb, fl, &res); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -50,7 +52,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return NULL; } #endif - if (!q->filter_list || err < 0) { + if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; @@ -75,7 +77,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -87,7 +89,7 @@ prio_enqueue(struct 
sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } @@ -322,9 +324,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - cl_q->qstats.qlen = cl_q->q.qlen; - if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) return -1; return 0; @@ -351,7 +352,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8056fb4e618a..3ec7e88a43ca 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -181,7 +181,7 @@ struct qfq_group { }; struct qfq_sched { - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. 
*/ @@ -459,7 +459,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -484,7 +485,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, &cl->rate_est, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); if (err) @@ -576,7 +578,8 @@ static void qfq_put_class(struct Qdisc *sch, unsigned long arg) qfq_destroy_class(sch, cl); } -static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch, + unsigned long cl) { struct qfq_sched *q = qdisc_priv(sch); @@ -661,14 +664,14 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct tc_qfq_stats xstats; memset(&xstats, 0, sizeof(xstats)); - cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + gnet_stats_copy_queue(d, NULL, + &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); @@ -704,6 +707,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { @@ -714,7 +718,8 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = 
NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + fl = rcu_dereference_bh(q->filter_list); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1224,7 +1229,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl = qfq_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return err; } @@ -1244,7 +1249,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { cl->qstats.drops++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return err; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 633e32defdcc..6c0534cc7758 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -74,7 +74,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.prob_drop++; goto congestion_drop; @@ -84,7 +84,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q) || !INET_ECN_set_ce(skb)) { q->stats.forced_drop++; @@ -100,7 +100,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -142,7 +142,7 @@ static unsigned int red_drop(struct Qdisc *sch) if (child->ops->drop && (len = child->ops->drop(child)) > 0) { q->stats.other++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); sch->q.qlen--; return len; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 9b0f7093d970..5819dd82630d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ 
-55,7 +55,7 @@ struct sfb_bins { struct sfb_sched_data { struct Qdisc *qdisc; - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; @@ -253,13 +253,13 @@ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) return false; } -static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, +static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; + struct tcf_proto *fl; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -289,7 +290,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } @@ -306,9 +307,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } - if (q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (fl) { /* If using external classifiers, get result and record it. 
*/ - if (!sfb_classify(skb, q, &ret, &salt)) + if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; keys.src = salt; keys.dst = 0; @@ -346,7 +348,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } @@ -374,7 +376,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } if (sfb_rate_limit(skb, q)) { - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } @@ -409,7 +411,7 @@ enqueue: increment_qlen(skb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; - sch->qstats.drops++; + qdisc_qstats_drop(sch); } return ret; @@ -418,7 +420,7 @@ drop: return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -660,7 +662,8 @@ static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfb_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 1af2f73906d0..b877140beda5 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -125,7 +125,7 @@ struct sfq_sched_data { u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ - struct tcf_proto *filter_list; + struct tcf_proto __rcu *filter_list; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ @@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; + struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && @@ -194,13 +195,14 @@ 
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); - if (!q->filter_list) { + fl = rcu_dereference_bh(q->filter_list); + if (!fl) { skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); return sfq_hash(q, skb) + 1; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, q->filter_list, &res); + result = tc_classify(skb, fl, &res); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -310,11 +312,6 @@ static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) slot->skblist_prev = skb; } -#define slot_queue_walk(slot, skb) \ - for (skb = slot->skblist_next; \ - skb != (struct sk_buff *)slot; \ - skb = skb->next) - static unsigned int sfq_drop(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); @@ -334,8 +331,8 @@ drop: sfq_dec(q, x); kfree_skb(skb); sch->q.qlen--; - sch->qstats.drops++; - sch->qstats.backlog -= len; + qdisc_qstats_drop(sch); + qdisc_qstats_backlog_dec(sch, skb); return len; } @@ -382,7 +379,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) - sch->qstats.drops++; + qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } @@ -412,7 +409,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) break; case RED_PROB_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -429,7 +426,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) goto congestion_drop; case RED_HARD_MARK: - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && @@ -464,7 +461,7 @@ congestion_drop: } enqueue: - sch->qstats.backlog += qdisc_pkt_len(skb); + qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); 
slot_queue_add(slot, skb); sfq_inc(q, x); @@ -523,7 +520,7 @@ next_slot: sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { @@ -589,7 +586,8 @@ static void sfq_rehash(struct Qdisc *sch) if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { -drop: sch->qstats.backlog -= qdisc_pkt_len(skb); +drop: + qdisc_qstats_backlog_dec(sch, skb); kfree_skb(skb); dropped++; continue; @@ -841,7 +839,8 @@ static void sfq_put(struct Qdisc *q, unsigned long cl) { } -static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch, + unsigned long cl) { struct sfq_sched_data *q = qdisc_priv(sch); @@ -872,7 +871,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.qlen = slot->qlen; qs.backlog = slot->backlog; } - if (gnet_stats_copy_queue(d, &qs) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 18ff63433709..a4afde14e865 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -175,7 +175,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(segs, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); } else { nb++; } @@ -201,7 +201,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) - sch->qstats.drops++; + qdisc_qstats_drop(sch); return ret; } @@ -216,7 +216,7 @@ static unsigned int tbf_drop(struct Qdisc *sch) if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { sch->q.qlen--; - 
sch->qstats.drops++; + qdisc_qstats_drop(sch); } return len; } @@ -239,7 +239,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = ktime_to_ns(ktime_get()); + now = ktime_get_ns(); toks = min_t(s64, now - q->t_c, q->buffer); if (tbf_peak_present(q)) { @@ -268,7 +268,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) } qdisc_watchdog_schedule_ns(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + now + max_t(long, -toks, -ptoks), + true); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -281,7 +282,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) (cf. CSZ, HPFQ, HFSC) */ - sch->qstats.overlimits++; + qdisc_qstats_overlimit(sch); } return NULL; } @@ -292,7 +293,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -431,7 +432,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = ktime_to_ns(ktime_get()); + q->t_c = ktime_get_ns(); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 474167162947..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -96,11 +96,14 @@ teql_dequeue(struct Qdisc *sch) struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; + struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); + q = rcu_dereference_bh(dat_queue->qdisc); + if (skb == NULL) { - struct net_device *m = qdisc_dev(dat_queue->qdisc); + struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); @@ -108,7 +111,7 @@ teql_dequeue(struct Qdisc *sch) } else { qdisc_bstats_update(sch, skb); } - sch->q.qlen = dat->q.qlen 
+ dat_queue->qdisc->q.qlen; + sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } @@ -157,9 +160,9 @@ teql_destroy(struct Qdisc *sch) txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; - root_lock = qdisc_root_sleeping_lock(txq->qdisc); + root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); - qdisc_reset(txq->qdisc); + qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } @@ -266,7 +269,7 @@ static inline int teql_resolve(struct sk_buff *skb, struct dst_entry *dst = skb_dst(skb); int res; - if (txq->qdisc == &noop_qdisc) + if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) @@ -301,7 +304,6 @@ restart: do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); - const struct net_device_ops *slave_ops = slave->netdev_ops; if (slave_txq->qdisc_sleeping != q) continue; @@ -317,8 +319,8 @@ restart: unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && - slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) { - txq_trans_update(slave_txq); + netdev_start_xmit(skb, slave, slave_txq, false) == + NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); @@ -468,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; - dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); @@ -485,8 +487,8 @@ static int __init teql_init(void) struct net_device *dev; struct teql_master *master; - dev = alloc_netdev(sizeof(struct teql_master), - "teql%d", teql_master_setup); + dev = alloc_netdev(sizeof(struct teql_master), "teql%d", + NET_NAME_UNKNOWN, teql_master_setup); if (!dev) { err = -ENOMEM; break; diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 5c30b7a873df..3b4ffb021cf1 100644 
--- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ protocol.o endpointola.o associola.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ - inqueue.o outqueue.o ulpqueue.o command.o \ + inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o ssnmap.o auth.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 06a9ee6b2d3a..a88b8524846e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -813,6 +813,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, else { dst_release(transport->dst); transport->dst = NULL; + ulp_notify = false; } spc_state = SCTP_ADDR_UNREACHABLE; @@ -1244,7 +1245,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, { u8 score_curr, score_best; - if (best == NULL) + if (best == NULL || curr == best) return curr; score_curr = sctp_trans_score(curr); @@ -1355,14 +1356,11 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc) trans_sec = trans_pri; /* If we failed to find a usable transport, just camp on the - * primary or retran, even if they are inactive, if possible - * pick a PF iff it's the better choice. + * active or pick a PF iff it's the better choice. */ if (trans_pri == NULL) { - trans_pri = sctp_trans_elect_best(asoc->peer.primary_path, - asoc->peer.retran_path); - trans_pri = sctp_trans_elect_best(trans_pri, trans_pf); - trans_sec = asoc->peer.primary_path; + trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf); + trans_sec = trans_pri; } /* Set the active and retran transports. */ diff --git a/net/sctp/command.c b/net/sctp/command.c deleted file mode 100644 index dd7375851618..000000000000 --- a/net/sctp/command.c +++ /dev/null @@ -1,68 +0,0 @@ -/* SCTP kernel implementation Copyright (C) 1999-2001 - * Cisco, Motorola, and IBM - * Copyright 2001 La Monte H.P. 
Yarroll - * - * This file is part of the SCTP kernel implementation - * - * These functions manipulate sctp command sequences. - * - * This SCTP implementation is free software; - * you can redistribute it and/or modify it under the terms of - * the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This SCTP implementation is distributed in the hope that it - * will be useful, but WITHOUT ANY WARRANTY; without even the implied - * ************************ - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. If not, see - * <http://www.gnu.org/licenses/>. - * - * Please send any bug reports or fixes you make to the - * email address(es): - * lksctp developers <linux-sctp@vger.kernel.org> - * - * Written or modified by: - * La Monte H.P. Yarroll <piggy@acm.org> - * Karl Knutson <karl@athena.chicago.il.us> - */ - -#include <linux/types.h> -#include <net/sctp/sctp.h> -#include <net/sctp/sm.h> - -/* Initialize a block of memory as a command sequence. */ -int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) -{ - memset(seq, 0, sizeof(sctp_cmd_seq_t)); - return 1; /* We always succeed. */ -} - -/* Add a command to a sctp_cmd_seq_t. - * Return 0 if the command sequence is full. - */ -void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj) -{ - BUG_ON(seq->next_free_slot >= SCTP_MAX_NUM_COMMANDS); - - seq->cmds[seq->next_free_slot].verb = verb; - seq->cmds[seq->next_free_slot++].obj = obj; -} - -/* Return the next command structure in a sctp_cmd_seq. - * Returns NULL at the end of the sequence. 
- */ -sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq) -{ - sctp_cmd_t *retval = NULL; - - if (seq->next_cmd < seq->next_free_slot) - retval = &seq->cmds[seq->next_cmd++]; - - return retval; -} - diff --git a/net/sctp/input.c b/net/sctp/input.c index f2e2cbd2d750..b6493b3f11a9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -133,9 +133,13 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) && - sctp_rcv_checksum(net, skb) < 0) + + skb->csum_valid = 0; /* Previous value not applicable */ + if (skb_csum_unnecessary(skb)) + __skb_decr_checksum_unnecessary(skb); + else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) goto discard_it; + skb->csum_valid = 1; skb_pull(skb, sizeof(struct sctphdr)); @@ -575,11 +579,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) int err; struct net *net = dev_net(skb->dev); - if (skb->len < ihlen + 8) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - return; - } - /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 1999592ba88c..0e4198ee2370 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -434,7 +434,7 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { - if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + if (addr->sa.sa_family == AF_INET) { sk->sk_v6_rcv_saddr.s6_addr32[0] = 0; sk->sk_v6_rcv_saddr.s6_addr32[1] = 0; sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); @@ -448,7 +448,7 @@ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) /* Initialize sk->sk_daddr from sctp_addr. 
*/ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { - if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + if (addr->sa.sa_family == AF_INET) { sk->sk_v6_daddr.s6_addr32[0] = 0; sk->sk_v6_daddr.s6_addr32[1] = 0; sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff); @@ -556,8 +556,6 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (IPV6_ADDR_ANY == type) return 1; if (type == IPV6_ADDR_MAPPED) { - if (sp && !sp->v4mapped) - return 0; if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); @@ -587,8 +585,6 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, /* Note: This routine is used in input, so v4-mapped-v6 * are disallowed here when there is no sctp_sock. */ - if (!sp || !sp->v4mapped) - return 0; if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); @@ -675,11 +671,23 @@ out: return newsk; } -/* Map v4 address to mapped v6 address */ -static void sctp_v6_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +/* Format a sockaddr for return to user space. This makes sure the return is + * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. + */ +static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { - if (sp->v4mapped && AF_INET == addr->sa.sa_family) - sctp_v4_map_v6(addr); + if (sp->v4mapped) { + if (addr->sa.sa_family == AF_INET) + sctp_v4_map_v6(addr); + } else { + if (addr->sa.sa_family == AF_INET6 && + ipv6_addr_v4mapped(&addr->v6.sin6_addr)) + sctp_v6_map_v4(addr); + } + + if (addr->sa.sa_family == AF_INET) + return sizeof(struct sockaddr_in); + return sizeof(struct sockaddr_in6); } /* Where did this skb come from? */ @@ -706,82 +714,68 @@ static void sctp_v6_ecn_capable(struct sock *sk) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } -/* Initialize a PF_INET6 socket msg_name. 
*/ -static void sctp_inet6_msgname(char *msgname, int *addr_len) -{ - struct sockaddr_in6 *sin6; - - sin6 = (struct sockaddr_in6 *)msgname; - sin6->sin6_family = AF_INET6; - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; /*FIXME */ - *addr_len = sizeof(struct sockaddr_in6); -} - /* Initialize a PF_INET msgname from a ulpevent. */ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addrlen) { - struct sockaddr_in6 *sin6, *sin6from; - - if (msgname) { - union sctp_addr *addr; - struct sctp_association *asoc; - - asoc = event->asoc; - sctp_inet6_msgname(msgname, addrlen); - sin6 = (struct sockaddr_in6 *)msgname; - sin6->sin6_port = htons(asoc->peer.port); - addr = &asoc->peer.primary_addr; + union sctp_addr *addr; + struct sctp_association *asoc; + union sctp_addr *paddr; - /* Note: If we go to a common v6 format, this code - * will change. - */ + if (!msgname) + return; - /* Map ipv4 address into v4-mapped-on-v6 address. */ - if (sctp_sk(asoc->base.sk)->v4mapped && - AF_INET == addr->sa.sa_family) { - sctp_v4_map_v6((union sctp_addr *)sin6); - sin6->sin6_addr.s6_addr32[3] = - addr->v4.sin_addr.s_addr; - return; - } + addr = (union sctp_addr *)msgname; + asoc = event->asoc; + paddr = &asoc->peer.primary_addr; - sin6from = &asoc->peer.primary_addr.v6; - sin6->sin6_addr = sin6from->sin6_addr; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sin6from->sin6_scope_id; + if (paddr->sa.sa_family == AF_INET) { + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = htons(asoc->peer.port); + addr->v4.sin_addr = paddr->v4.sin_addr; + } else { + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_flowinfo = 0; + if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id; + else + addr->v6.sin6_scope_id = 0; + addr->v6.sin6_port = htons(asoc->peer.port); + addr->v6.sin6_addr = paddr->v6.sin6_addr; } + + *addrlen = 
sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr); } /* Initialize a msg_name from an inbound skb. */ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, int *addr_len) { + union sctp_addr *addr; struct sctphdr *sh; - struct sockaddr_in6 *sin6; - - if (msgname) { - sctp_inet6_msgname(msgname, addr_len); - sin6 = (struct sockaddr_in6 *)msgname; - sh = sctp_hdr(skb); - sin6->sin6_port = sh->source; - - /* Map ipv4 address into v4-mapped-on-v6 address. */ - if (sctp_sk(skb->sk)->v4mapped && - ip_hdr(skb)->version == 4) { - sctp_v4_map_v6((union sctp_addr *)sin6); - sin6->sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr; - return; - } - /* Otherwise, just copy the v6 address. */ - sin6->sin6_addr = ipv6_hdr(skb)->saddr; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (!msgname) + return; + + addr = (union sctp_addr *)msgname; + sh = sctp_hdr(skb); + + if (ip_hdr(skb)->version == 4) { + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = sh->source; + addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; + } else { + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_flowinfo = 0; + addr->v6.sin6_port = sh->source; + addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; + if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { struct sctp_ulpevent *ev = sctp_skb2event(skb); - sin6->sin6_scope_id = ev->iif; + addr->v6.sin6_scope_id = ev->iif; } } + + *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); } /* Do we support this AF? 
*/ @@ -857,9 +851,6 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) return 0; } rcu_read_unlock(); - } else if (type == IPV6_ADDR_MAPPED) { - if (!opt->v4mapped) - return 0; } af = opt->pf->af; @@ -914,6 +905,23 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 1; } +/* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ +static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + int rc; + + rc = inet6_getname(sock, uaddr, uaddr_len, peer); + + if (rc != 0) + return rc; + + *uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk), + (union sctp_addr *)uaddr); + + return rc; +} + static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -922,7 +930,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, - .getname = inet6_getname, + .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, @@ -974,8 +982,6 @@ static struct sctp_af sctp_af_inet6 = { .copy_addrlist = sctp_v6_copy_addrlist, .from_skb = sctp_v6_from_skb, .from_sk = sctp_v6_from_sk, - .to_sk_saddr = sctp_v6_to_sk_saddr, - .to_sk_daddr = sctp_v6_to_sk_daddr, .from_addr_param = sctp_v6_from_addr_param, .to_addr_param = sctp_v6_to_addr_param, .cmp_addr = sctp_v6_cmp_addr, @@ -1005,7 +1011,9 @@ static struct sctp_pf sctp_pf_inet6 = { .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, .create_accept_sk = sctp_v6_create_accept_sk, - .addr_v4map = sctp_v6_addr_v4map, + .addr_to_user = sctp_v6_addr_to_user, + .to_sk_saddr = sctp_v6_to_sk_saddr, + .to_sk_daddr = sctp_v6_to_sk_daddr, .af = &sctp_af_inet6, }; diff --git a/net/sctp/output.c b/net/sctp/output.c index 01ab8e0723f0..42dffd428389 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -178,7 +178,7 @@ sctp_xmit_t 
sctp_packet_transmit_chunk(struct sctp_packet *packet, case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: break; } @@ -599,7 +599,7 @@ out: return err; no_route: kfree_skb(nskb); - IP_INC_STATS_BH(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); /* FIXME: Returning the 'err' will effect all the associations * associated with a socket, although only one of the paths of the @@ -633,7 +633,6 @@ nomem: static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { - sctp_xmit_t retval = SCTP_XMIT_OK; size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; @@ -658,15 +657,11 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, datasize = sctp_data_size(chunk); - if (datasize > rwnd) { - if (inflight > 0) { - /* We have (at least) one data chunk in flight, - * so we can't fall back to rule 6.1 B). - */ - retval = SCTP_XMIT_RWND_FULL; - goto finish; - } - } + if (datasize > rwnd && inflight > 0) + /* We have (at least) one data chunk in flight, + * so we can't fall back to rule 6.1 B). + */ + return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * @@ -680,36 +675,44 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. 
*/ - if (chunk->fast_retransmit != SCTP_NEED_FRTX) - if (flight_size >= transport->cwnd) { - retval = SCTP_XMIT_RWND_FULL; - goto finish; - } + if (chunk->fast_retransmit != SCTP_NEED_FRTX && + flight_size >= transport->cwnd) + return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ - if (!sctp_sk(asoc->base.sk)->nodelay && sctp_packet_empty(packet) && - inflight && sctp_state(asoc, ESTABLISHED)) { - unsigned int max = transport->pathmtu - packet->overhead; - unsigned int len = chunk->skb->len + q->out_qlen; - - /* Check whether this chunk and all the rest of pending - * data will fit or delay in hopes of bundling a full - * sized packet. - * Don't delay large message writes that may have been - * fragmeneted into small peices. - */ - if ((len < max) && chunk->msg->can_delay) { - retval = SCTP_XMIT_NAGLE_DELAY; - goto finish; - } - } -finish: - return retval; + if (sctp_sk(asoc->base.sk)->nodelay) + /* Nagle disabled */ + return SCTP_XMIT_OK; + + if (!sctp_packet_empty(packet)) + /* Append to packet */ + return SCTP_XMIT_OK; + + if (inflight == 0) + /* Nothing unacked */ + return SCTP_XMIT_OK; + + if (!sctp_state(asoc, ESTABLISHED)) + return SCTP_XMIT_OK; + + /* Check whether this chunk and all the rest of pending data will fit + * or delay in hopes of bundling a full sized packet. 
+ */ + if (chunk->skb->len + q->out_qlen >= transport->pathmtu - packet->overhead) + /* Enough data queued to fill a packet */ + return SCTP_XMIT_OK; + + /* Don't delay large message writes that may have been fragmented */ + if (!chunk->msg->can_delay) + return SCTP_XMIT_OK; + + /* Defer until all data acked or packet full */ + return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 9c77947c0597..7e8f0a117106 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -629,7 +629,7 @@ redo: done = 1; break; - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt); @@ -1015,7 +1015,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) switch (status) { case SCTP_XMIT_PMTU_FULL: case SCTP_XMIT_RWND_FULL: - case SCTP_XMIT_NAGLE_DELAY: + case SCTP_XMIT_DELAY: /* We could not append this chunk, so put * the chunk back on the output queue. 
*/ @@ -1025,7 +1025,6 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) sctp_outq_head_data(q, chunk); goto sctp_flush_out; - break; case SCTP_XMIT_OK: /* The sender is in the SHUTDOWN-PENDING state, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6789d785e698..8f34b27d5775 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -366,7 +366,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !sysctl_ip_nonlocal_bind) + !net->ipv4.sysctl_ip_nonlocal_bind) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) @@ -576,10 +576,10 @@ out: return newsk; } -/* Map address, empty for v4 family */ -static void sctp_v4_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { - /* Empty */ + /* No address mapping for V4 sockets */ + return sizeof(struct sockaddr_in); } /* Dump the v4 addr to the seq file. 
*/ @@ -976,7 +976,9 @@ static struct sctp_pf sctp_pf_inet = { .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, .create_accept_sk = sctp_v4_create_accept_sk, - .addr_v4map = sctp_v4_addr_v4map, + .addr_to_user = sctp_v4_addr_to_user, + .to_sk_saddr = sctp_v4_to_sk_saddr, + .to_sk_daddr = sctp_v4_to_sk_daddr, .af = &sctp_af_inet }; @@ -1047,8 +1049,6 @@ static struct sctp_af sctp_af_inet = { .copy_addrlist = sctp_v4_copy_addrlist, .from_skb = sctp_v4_from_skb, .from_sk = sctp_v4_from_sk, - .to_sk_saddr = sctp_v4_to_sk_saddr, - .to_sk_daddr = sctp_v4_to_sk_daddr, .from_addr_param = sctp_v4_from_addr_param, .to_addr_param = sctp_v4_to_addr_param, .cmp_addr = sctp_v4_cmp_addr, @@ -1341,7 +1341,7 @@ static __init int sctp_init(void) if (!sctp_chunk_cachep) goto err_chunk_cachep; - status = percpu_counter_init(&sctp_sockets_allocated, 0); + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5170a1ff95a1..c8f606324134 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1775,9 +1775,22 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, /* Update the content of current association. */ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); - sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, - SCTP_STATE(SCTP_STATE_ESTABLISHED)); - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + if (sctp_state(asoc, SHUTDOWN_PENDING) && + (sctp_sstate(asoc->base.sk, CLOSING) || + sock_flag(asoc->base.sk, SOCK_DEAD))) { + /* if were currently in SHUTDOWN_PENDING, but the socket + * has been closed by user, don't transition to ESTABLISHED. + * Instead trigger SHUTDOWN bundled with COOKIE_ACK. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, + SCTP_ST_CHUNK(0), NULL, + commands); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } return SCTP_DISPOSITION_CONSUME; nomem_ev: @@ -4182,7 +4195,6 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, case SCTP_CID_ACTION_DISCARD: /* Discard the packet. */ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - break; case SCTP_CID_ACTION_DISCARD_ERR: /* Generate an ERROR chunk as response. */ hdr = unk_chunk->chunk_hdr; @@ -4198,11 +4210,9 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, /* Discard the packet. */ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); return SCTP_DISPOSITION_CONSUME; - break; case SCTP_CID_ACTION_SKIP: /* Skip the chunk. */ return SCTP_DISPOSITION_DISCARD; - break; case SCTP_CID_ACTION_SKIP_ERR: /* Generate an ERROR chunk as response. */ hdr = unk_chunk->chunk_hdr; @@ -4216,7 +4226,6 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, } /* Skip the chunk. */ return SCTP_DISPOSITION_CONSUME; - break; default: break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 429899689408..634a2abb5f3a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -254,7 +254,7 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, if (id_asoc && (id_asoc != addr_asoc)) return NULL; - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), (union sctp_addr *)addr); return transport; @@ -396,7 +396,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) /* Copy back into socket for getsockname() use. 
*/ if (!ret) { inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num); - af->to_sk_saddr(addr, sk); + sp->pf->to_sk_saddr(addr, sk); } return ret; @@ -1053,7 +1053,6 @@ static int __sctp_connect(struct sock *sk, struct sctp_association *asoc2; struct sctp_transport *transport; union sctp_addr to; - struct sctp_af *af; sctp_scope_t scope; long timeo; int err = 0; @@ -1081,6 +1080,8 @@ static int __sctp_connect(struct sock *sk, /* Walk through the addrs buffer and count the number of addresses. */ addr_buf = kaddrs; while (walk_size < addrs_size) { + struct sctp_af *af; + if (walk_size + sizeof(sa_family_t) > addrs_size) { err = -EINVAL; goto out_free; @@ -1205,8 +1206,7 @@ static int __sctp_connect(struct sock *sk, /* Initialize sk's dport and daddr for getpeername() */ inet_sk(sk)->inet_dport = htons(asoc->peer.port); - af = sctp_get_af_specific(sa_addr->sa.sa_family); - af->to_sk_daddr(sa_addr, sk); + sp->pf->to_sk_daddr(sa_addr, sk); sk->sk_err = 0; /* in-kernel sockets don't generally have a file allocated to them @@ -1602,12 +1602,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; sctp_cmsgs_t cmsgs = { NULL }; - int err; sctp_scope_t scope; - long timeo; - __u16 sinfo_flags = 0; + bool fill_sinfo_ttl = false; struct sctp_datamsg *datamsg; int msg_flags = msg->msg_flags; + __u16 sinfo_flags = 0; + long timeo; + int err; err = 0; sp = sctp_sk(sk); @@ -1648,10 +1649,21 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, msg_name = msg->msg_name; } - sinfo = cmsgs.info; sinit = cmsgs.init; + if (cmsgs.sinfo != NULL) { + memset(&default_sinfo, 0, sizeof(default_sinfo)); + default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid; + default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags; + default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid; + default_sinfo.sinfo_context = cmsgs.sinfo->snd_context; + default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id; - /* Did the user specify SNDRCVINFO? 
*/ + sinfo = &default_sinfo; + fill_sinfo_ttl = true; + } else { + sinfo = cmsgs.srinfo; + } + /* Did the user specify SNDINFO/SNDRCVINFO? */ if (sinfo) { sinfo_flags = sinfo->sinfo_flags; associd = sinfo->sinfo_assoc_id; @@ -1858,8 +1870,8 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, pr_debug("%s: we have a valid association\n", __func__); if (!sinfo) { - /* If the user didn't specify SNDRCVINFO, make up one with - * some defaults. + /* If the user didn't specify SNDINFO/SNDRCVINFO, make up + * one with some defaults. */ memset(&default_sinfo, 0, sizeof(default_sinfo)); default_sinfo.sinfo_stream = asoc->default_stream; @@ -1868,7 +1880,13 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, default_sinfo.sinfo_context = asoc->default_context; default_sinfo.sinfo_timetolive = asoc->default_timetolive; default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); + sinfo = &default_sinfo; + } else if (fill_sinfo_ttl) { + /* In case SNDINFO was specified, we still need to fill + * it with a default ttl from the assoc here. + */ + sinfo->sinfo_timetolive = asoc->default_timetolive; } /* API 7.1.7, the sndbuf size per association bounds the @@ -2042,8 +2060,6 @@ static int sctp_skb_pull(struct sk_buff *skb, int len) * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); - static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) @@ -2094,9 +2110,16 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sp->pf->skb_msgname(skb, msg->msg_name, addr_len); } + /* Check if we allow SCTP_NXTINFO. */ + if (sp->recvnxtinfo) + sctp_ulpevent_read_nxtinfo(event, msg, sk); + /* Check if we allow SCTP_RCVINFO. */ + if (sp->recvrcvinfo) + sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. 
*/ if (sp->subscribe.sctp_data_io_event) sctp_ulpevent_read_sndrcvinfo(event, msg); + #if 0 /* FIXME: we should be calling IP/IPv6 layers. */ if (sk->sk_protinfo.af_inet.cmsg_flags) @@ -2182,8 +2205,13 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) return -EFAULT; - /* - * At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, + if (sctp_sk(sk)->subscribe.sctp_data_io_event) + pr_warn_ratelimited(DEPRECATED "%s (pid %d) " + "Requested SCTP_SNDRCVINFO event.\n" + "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n", + current->comm, task_pid_nr(current)); + + /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ @@ -2747,19 +2775,22 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, char __user *optval, unsigned int optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (optlen != sizeof(struct sctp_sndrcvinfo)) + if (optlen != sizeof(info)) return -EINVAL; if (copy_from_user(&info, optval, optlen)) return -EFAULT; + if (info.sinfo_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { asoc->default_stream = info.sinfo_stream; asoc->default_flags = info.sinfo_flags; @@ -2777,6 +2808,44 @@ static int sctp_setsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. 
Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_setsockopt_default_sndinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (optlen != sizeof(info)) + return -EINVAL; + if (copy_from_user(&info, optval, optlen)) + return -EFAULT; + if (info.snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + asoc->default_stream = info.snd_sid; + asoc->default_flags = info.snd_flags; + asoc->default_ppid = info.snd_ppid; + asoc->default_context = info.snd_context; + } else { + sp->default_stream = info.snd_sid; + sp->default_flags = info.snd_flags; + sp->default_ppid = info.snd_ppid; + sp->default_context = info.snd_context; + } + + return 0; +} + /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as @@ -3523,7 +3592,6 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval, return 0; } - /* * SCTP_PEER_ADDR_THLDS * @@ -3574,6 +3642,38 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, return 0; } +static int sctp_setsockopt_recvrcvinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1; + + return 0; +} + +static int sctp_setsockopt_recvnxtinfo(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *) optval)) + return -EFAULT; + + sctp_sk(sk)->recvnxtinfo = (val == 0) ? 
0 : 1; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -3671,6 +3771,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_default_send_param(sk, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_setsockopt_primary_addr(sk, optval, optlen); break; @@ -3725,6 +3828,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_PEER_ADDR_THLDS: retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen); + break; + case SCTP_RECVNXTINFO: + retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -3971,6 +4080,9 @@ static int sctp_init_sock(struct sock *sk) /* Enable Nagle algorithm by default. */ sp->nodelay = 0; + sp->recvrcvinfo = 0; + sp->recvnxtinfo = 0; + /* Enable by default. */ sp->v4mapped = 1; @@ -4131,7 +4243,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, transport = asoc->peer.primary_path; status.sstat_assoc_id = sctp_assoc2id(asoc); - status.sstat_state = asoc->state; + status.sstat_state = sctp_assoc_to_state(asoc); status.sstat_rwnd = asoc->peer.rwnd; status.sstat_unackdata = asoc->unack_data; @@ -4143,7 +4255,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, transport->af_specific->sockaddr_len); /* Map ipv4 address into v4-mapped-on-v6 address. 
*/ - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), (union sctp_addr *)&status.sstat_primary.spinfo_address); status.sstat_primary.spinfo_state = transport->state; status.sstat_primary.spinfo_cwnd = transport->cwnd; @@ -4301,8 +4413,8 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); + struct sctp_sock *sp = sctp_sk(sk); struct socket *sock; - struct sctp_af *af; int err = 0; if (!asoc) @@ -4324,8 +4436,7 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) /* Make peeled-off sockets more like 1-1 accepted sockets. * Set the daddr and initialize id to something more random */ - af = sctp_get_af_specific(asoc->peer.primary_addr.sa.sa_family); - af->to_sk_daddr(&asoc->peer.primary_addr, sk); + sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sk); /* Populate the fields of the newsk from the oldsk and migrate the * asoc to the newsk. 
@@ -4709,8 +4820,8 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { memcpy(&temp, &from->ipaddr, sizeof(temp)); - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); - addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + addrlen = sctp_get_pf_specific(sk->sk_family) + ->addr_to_user(sp, &temp); if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) @@ -4754,9 +4865,9 @@ static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, if (!temp.v4.sin_port) temp.v4.sin_port = htons(port); - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), - &temp); - addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + addrlen = sctp_get_pf_specific(sk->sk_family) + ->addr_to_user(sctp_sk(sk), &temp); + if (space_left < addrlen) { cnt = -ENOMEM; break; @@ -4844,8 +4955,8 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, */ list_for_each_entry(addr, &bp->address_list, list) { memcpy(&temp, &addr->a, sizeof(temp)); - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); - addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + addrlen = sctp_get_pf_specific(sk->sk_family) + ->addr_to_user(sp, &temp); if (space_left < addrlen) { err = -ENOMEM; /*fixme: right error?*/ goto out; @@ -4904,7 +5015,7 @@ static int sctp_getsockopt_primary_addr(struct sock *sk, int len, memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr, asoc->peer.primary_path->af_specific->sockaddr_len); - sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, + sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp, (union sctp_addr *)&prim.ssp_addr); if (put_user(len, optlen)) @@ -4964,14 +5075,14 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_sndrcvinfo info; - struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); + struct 
sctp_association *asoc; + struct sctp_sndrcvinfo info; - if (len < sizeof(struct sctp_sndrcvinfo)) + if (len < sizeof(info)) return -EINVAL; - len = sizeof(struct sctp_sndrcvinfo); + len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; @@ -4979,7 +5090,6 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - if (asoc) { info.sinfo_stream = asoc->default_stream; info.sinfo_flags = asoc->default_flags; @@ -5002,6 +5112,48 @@ static int sctp_getsockopt_default_send_param(struct sock *sk, return 0; } +/* RFC6458, Section 8.1.31. Set/get Default Send Parameters + * (SCTP_DEFAULT_SNDINFO) + */ +static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_association *asoc; + struct sctp_sndinfo info; + + if (len < sizeof(info)) + return -EINVAL; + + len = sizeof(info); + + if (copy_from_user(&info, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.snd_assoc_id); + if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + if (asoc) { + info.snd_sid = asoc->default_stream; + info.snd_flags = asoc->default_flags; + info.snd_ppid = asoc->default_ppid; + info.snd_context = asoc->default_context; + } else { + info.snd_sid = sp->default_stream; + info.snd_flags = sp->default_flags; + info.snd_ppid = sp->default_ppid; + info.snd_context = sp->default_context; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + + return 0; +} + /* * * 7.1.5 SCTP_NODELAY @@ -5752,6 +5904,46 @@ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, return 0; } +static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + 
len = sizeof(int); + if (sctp_sk(sk)->recvrcvinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val = 0; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + if (sctp_sk(sk)->recvnxtinfo) + val = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -5821,6 +6013,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_default_send_param(sk, len, optval, optlen); break; + case SCTP_DEFAULT_SNDINFO: + retval = sctp_getsockopt_default_sndinfo(sk, len, + optval, optlen); + break; case SCTP_PRIMARY_ADDR: retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); break; @@ -5895,6 +6091,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); break; + case SCTP_RECVRCVINFO: + retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); + break; + case SCTP_RECVNXTINFO: + retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6390,8 +6592,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) struct cmsghdr *cmsg; struct msghdr *my_msg = (struct msghdr *)msg; - for (cmsg = CMSG_FIRSTHDR(msg); - cmsg != NULL; + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(my_msg, cmsg)) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; @@ -6404,7 +6605,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) switch (cmsg->cmsg_type) { case SCTP_INIT: /* SCTP Socket API Extension - * 5.2.1 SCTP Initiation 
Structure (SCTP_INIT) + * 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for * initializing new SCTP associations with sendmsg(). @@ -6416,15 +6617,15 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_initmsg))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg))) return -EINVAL; - cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg); + + cmsgs->init = CMSG_DATA(cmsg); break; case SCTP_SNDRCV: /* SCTP Socket API Extension - * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV) + * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for * sendmsg() and describes SCTP header information @@ -6434,24 +6635,44 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo */ - if (cmsg->cmsg_len != - CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) return -EINVAL; - cmsgs->info = - (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + cmsgs->srinfo = CMSG_DATA(cmsg); - /* Minimally, validate the sinfo_flags. */ - if (cmsgs->info->sinfo_flags & + if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_SNDINFO: + /* SCTP Socket API Extension + * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) + * + * This cmsghdr structure specifies SCTP options for + * sendmsg(). This structure and SCTP_RCVINFO replaces + * SCTP_SNDRCV which has been deprecated. 
+ * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo))) + return -EINVAL; + + cmsgs->sinfo = CMSG_DATA(cmsg); + + if (cmsgs->sinfo->snd_flags & + ~(SCTP_UNORDERED | SCTP_ADDR_OVER | + SCTP_ABORT | SCTP_EOF)) + return -EINVAL; + break; default: return -EINVAL; } } + return 0; } @@ -6518,8 +6739,8 @@ out: * Note: This is pretty much the same routine as in core/datagram.c * with a few changes to make lksctp work. */ -static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, - int noblock, int *err) +struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, + int noblock, int *err) { int error; struct sk_buff *skb; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 12c7e01c2677..2e9ada10fd84 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -424,8 +424,9 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - pr_warn_once("Changing rto_alpha or rto_beta may lead to " - "suboptimal rtt/srtt estimations!\n"); + if (write) + pr_warn_once("Changing rto_alpha or rto_beta may lead to " + "suboptimal rtt/srtt estimations!\n"); return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 7dd672fa651f..a0a431824f63 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -289,8 +289,8 @@ void sctp_transport_route(struct sctp_transport *transport, */ if (asoc && (!asoc->peer.primary_path || (transport == asoc->peer.active_path))) - opt->pf->af->to_sk_saddr(&transport->saddr, - asoc->base.sk); + opt->pf->to_sk_saddr(&transport->saddr, + asoc->base.sk); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -594,15 +594,16 @@ void sctp_transport_burst_reset(struct sctp_transport *t) } /* What is the next timeout value for this transport? 
*/ -unsigned long sctp_transport_timeout(struct sctp_transport *t) +unsigned long sctp_transport_timeout(struct sctp_transport *trans) { - unsigned long timeout; - timeout = t->rto + sctp_jitter(t->rto); - if ((t->state != SCTP_UNCONFIRMED) && - (t->state != SCTP_PF)) - timeout += t->hbinterval; - timeout += jiffies; - return timeout; + /* RTO + timer slack +/- 50% of RTO */ + unsigned long timeout = (trans->rto >> 1) + prandom_u32_max(trans->rto); + + if (trans->state != SCTP_UNCONFIRMED && + trans->state != SCTP_PF) + timeout += trans->hbinterval; + + return timeout + jiffies; } /* Reset transport variables to their initial values */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index b6842fdb53d4..d1e38308f615 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -341,7 +341,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage)); /* Map ipv4 address into v4-mapped-on-v6 address. */ - sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_v4map( + sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_to_user( sctp_sk(asoc->base.sk), (union sctp_addr *)&spc->spc_aaddr); @@ -886,6 +886,69 @@ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, sizeof(sinfo), &sinfo); } +/* RFC6458, Section 5.3.5 SCTP Receive Information Structure + * (SCTP_RCVINFO) + */ +void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr) +{ + struct sctp_rcvinfo rinfo; + + if (sctp_ulpevent_is_notification(event)) + return; + + memset(&rinfo, 0, sizeof(struct sctp_rcvinfo)); + rinfo.rcv_sid = event->stream; + rinfo.rcv_ssn = event->ssn; + rinfo.rcv_ppid = event->ppid; + rinfo.rcv_flags = event->flags; + rinfo.rcv_tsn = event->tsn; + rinfo.rcv_cumtsn = event->cumtsn; + rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc); + rinfo.rcv_context = event->asoc->default_rcv_context; + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO, + sizeof(rinfo), &rinfo); +} + +/*
RFC6458, Section 5.3.6. SCTP Next Receive Information Structure + * (SCTP_NXTINFO) + */ +static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + const struct sk_buff *skb) +{ + struct sctp_nxtinfo nxtinfo; + + memset(&nxtinfo, 0, sizeof(nxtinfo)); + nxtinfo.nxt_sid = event->stream; + nxtinfo.nxt_ppid = event->ppid; + nxtinfo.nxt_flags = event->flags; + if (sctp_ulpevent_is_notification(event)) + nxtinfo.nxt_flags |= SCTP_NOTIFICATION; + nxtinfo.nxt_length = skb->len; + nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc); + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO, + sizeof(nxtinfo), &nxtinfo); +} + +void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr, + struct sock *sk) +{ + struct sk_buff *skb; + int err; + + skb = sctp_skb_recv_datagram(sk, MSG_PEEK, 1, &err); + if (skb != NULL) { + __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb), + msghdr, skb); + /* Just release refcount here. */ + kfree_skb(skb); + } +} + /* Do accounting for bytes received and hold a reference to the association * for each skb. 
*/ diff --git a/net/socket.c b/net/socket.c index abf56b2a14f9..fe20c319a0bb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include <linux/sockios.h> #include <linux/atalk.h> #include <net/busy_poll.h> +#include <linux/errqueue.h> #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -609,17 +610,25 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) +void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) { - *tx_flags = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - *tx_flags |= SKBTX_HW_TSTAMP; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - *tx_flags |= SKBTX_SW_TSTAMP; - if (sock_flag(sk, SOCK_WIFI_STATUS)) - *tx_flags |= SKBTX_WIFI_STATUS; + u8 flags = *tx_flags; + + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + flags |= SKBTX_HW_TSTAMP; + + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) + flags |= SKBTX_SW_TSTAMP; + + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + flags |= SKBTX_SCHED_TSTAMP; + + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) + flags |= SKBTX_ACK_TSTAMP; + + *tx_flags = flags; } -EXPORT_SYMBOL(sock_tx_timestamp); +EXPORT_SYMBOL(__sock_tx_timestamp); static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) @@ -697,7 +706,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); - struct timespec ts[3]; + struct scm_timestamping tss; int empty = 1; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); @@ -714,28 +723,24 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); } else { - skb_get_timestampns(skb, &ts[0]); + struct timespec ts; + skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, - sizeof(ts[0]), &ts[0]); + sizeof(ts), &ts); 
} } - - memset(ts, 0, sizeof(ts)); - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec_cond(skb->tstamp, ts + 0)) + memset(&tss, 0, sizeof(tss)); + if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) + empty = 0; + if (shhwtstamps && + (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; - if (shhwtstamps) { - if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->syststamp, ts + 1)) - empty = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts + 2)) - empty = 0; - } if (!empty) put_cmsg(msg, SOL_SOCKET, - SCM_TIMESTAMPING, sizeof(ts), &ts); + SCM_TIMESTAMPING, sizeof(tss), &tss); } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); @@ -1060,7 +1065,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; - err = f_setown(sock->file, pid, 1); + f_setown(sock->file, pid, 1); + err = 0; break; case FIOGETOWN: case SIOCGPGRP: @@ -1988,6 +1994,9 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(kmsg, umsg, sizeof(struct msghdr))) return -EFAULT; + if (kmsg->msg_name == NULL) + kmsg->msg_namelen = 0; + if (kmsg->msg_namelen < 0) return -EINVAL; @@ -2593,7 +2602,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) * * This function is called by a protocol handler that wants to * advertise its address family, and have it linked into the - * socket interface. The value ops->family coresponds to the + * socket interface. The value ops->family corresponds to the * socket system call protocol family. 
*/ int sock_register(const struct net_proto_family *ops) diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index a622ad64acd8..2e0a6f92e563 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, len = (buf + buflen) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); if (p) { - unsigned long scope_id = 0; + u32 scope_id = 0; struct net_device *dev; dev = dev_get_by_name(net, p); @@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) == 0) { kfree(p); return 0; } @@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags) * @sap: buffer into which to plant socket address * @salen: size of buffer * - * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and + * @uaddr does not have to be '\0'-terminated, but kstrtou8() and * rpc_pton() require proper string termination to be successful. 
* * Returns the size of the socket address if successful; otherwise @@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr, const size_t salen) { char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; - unsigned long portlo, porthi; + u8 portlo, porthi; unsigned short port; if (uaddr_len > RPCBIND_MAXUADDRLEN) @@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr, c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; - if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0)) - return 0; - if (unlikely(portlo > 255)) + if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0)) return 0; *c = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; - if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0)) - return 0; - if (unlikely(porthi > 255)) + if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0)) return 0; port = (unsigned short)((porthi << 8) | portlo); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index f77366717420..383eb919ac0b 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) if (!val) goto out_inval; - ret = strict_strtoul(val, 0, &num); + ret = kstrtoul(val, 0, &num); if (ret == -EINVAL) goto out_inval; nbits = fls(num); @@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = { module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); +static unsigned long auth_max_cred_cachesize = ULONG_MAX; +module_param(auth_max_cred_cachesize, ulong, 0644); +MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); + static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) @@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred) } EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire); +char * +rpcauth_stringify_acceptor(struct rpc_cred *cred) +{ + if 
(!cred->cr_ops->crstringify_acceptor) + return NULL; + return cred->cr_ops->crstringify_acceptor(cred); +} +EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); + /* * Destroy a list of credentials */ @@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) return freed; } +static unsigned long +rpcauth_cache_do_shrink(int nr_to_scan) +{ + LIST_HEAD(free); + unsigned long freed; + + spin_lock(&rpc_credcache_lock); + freed = rpcauth_prune_expired(&free, nr_to_scan); + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); + + return freed; +} + /* * Run memory cache shrinker. */ @@ -479,9 +506,6 @@ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - LIST_HEAD(free); - unsigned long freed; - if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; @@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (list_empty(&cred_unused)) return SHRINK_STOP; - spin_lock(&rpc_credcache_lock); - freed = rpcauth_prune_expired(&free, sc->nr_to_scan); - spin_unlock(&rpc_credcache_lock); - rpcauth_destroy_credlist(&free); - - return freed; + return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long @@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; } +static void +rpcauth_cache_enforce_limit(void) +{ + unsigned long diff; + unsigned int nr_to_scan; + + if (number_cred_unused <= auth_max_cred_cachesize) + return; + diff = number_cred_unused - auth_max_cred_cachesize; + nr_to_scan = 100; + if (diff < nr_to_scan) + nr_to_scan = diff; + rpcauth_cache_do_shrink(nr_to_scan); +} + /* * Look up a process' credentials in the authentication cache */ @@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if 
(!entry->cr_ops->crmatch(acred, entry, flags)) continue; + if (flags & RPCAUTH_LOOKUP_RCU) { + if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && + !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) + cred = entry; + break; + } spin_lock(&cache->lock); if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { spin_unlock(&cache->lock); @@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (cred != NULL) goto found; + if (flags & RPCAUTH_LOOKUP_RCU) + return ERR_PTR(-ECHILD); + new = auth->au_ops->crcreate(auth, acred, flags); if (IS_ERR(new)) { cred = new; @@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); + rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && @@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) memset(&acred, 0, sizeof(acred)); acred.uid = cred->fsuid; acred.gid = cred->fsgid; - acred.group_info = get_group_info(((struct cred *)cred)->group_info); - + acred.group_info = cred->group_info; ret = auth->au_ops->lookup_cred(auth, &acred, flags); - put_group_info(acred.group_info); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index ed04869b2d4f..6f6b829c9e8e 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +struct rpc_cred *rpc_lookup_cred_nonblock(void) +{ + return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); +} +EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock); + /* * Public call interface for looking up machine creds. 
*/ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index b6e440baccc3..afb292cd797d 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred) struct gss_cl_ctx *ctx = NULL; rcu_read_lock(); - if (gss_cred->gc_ctx) - ctx = gss_get_ctx(gss_cred->gc_ctx); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (ctx) + gss_get_ctx(ctx); rcu_read_unlock(); return ctx; } @@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct p = ERR_PTR(ret); goto err; } - dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u\n", - __func__, ctx->gc_expiry, now, timeout); - return q; + + /* is there any trailing data? */ + if (q == end) { + p = q; + goto done; + } + + /* pull in acceptor name (if there is one) */ + p = simple_get_netobj(q, end, &ctx->gc_acceptor); + if (IS_ERR(p)) + goto err; +done: + dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n", + __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, + ctx->gc_acceptor.data); + return p; err: dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p)); return p; @@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); + struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); struct rpc_task *task; - if (gss_cred->gc_ctx == NULL || - test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) return 0; - gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY; + ctx->gc_proc = RPC_GSS_PROC_DESTROY; cred->cr_ops = &gss_nullops; /* Take a reference to ensure the cred will be destroyed either @@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx) gss_delete_sec_context(&ctx->gc_gss_ctx); 
kfree(ctx->gc_wire_ctx.data); + kfree(ctx->gc_acceptor.data); kfree(ctx); } @@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) { struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth); - struct gss_cl_ctx *ctx = gss_cred->gc_ctx; + struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); call_rcu(&cred->cr_rcu, gss_free_cred_callback); @@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred) return err; } +static char * +gss_stringify_acceptor(struct rpc_cred *cred) +{ + char *string = NULL; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; + struct xdr_netobj *acceptor; + + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (!ctx) + goto out; + + acceptor = &ctx->gc_acceptor; + + /* no point if there's no string */ + if (!acceptor->len) + goto out; + + string = kmalloc(acceptor->len + 1, GFP_KERNEL); + if (!string) + goto out; + + memcpy(string, acceptor->data, acceptor->len); + string[acceptor->len] = '\0'; +out: + rcu_read_unlock(); + return string; +} + /* * Returns -EACCES if GSS context is NULL or will expire within the * timeout (milliseconds) @@ -1340,15 +1385,16 @@ static int gss_key_timeout(struct rpc_cred *rc) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; unsigned long now = jiffies; unsigned long expire; - if (gss_cred->gc_ctx == NULL) - return -EACCES; - - expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ); - - if (time_after(now, expire)) + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (ctx) + expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ); + rcu_read_unlock(); + if (!ctx || time_after(now, expire)) return -EACCES; return 0; } @@ -1357,13 +1403,19 @@ static int gss_match(struct auth_cred *acred,
struct rpc_cred *rc, int flags) { struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx; int ret; if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags)) goto out; /* Don't match with creds that have expired. */ - if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + if (!ctx || time_after(jiffies, ctx->gc_expiry)) { + rcu_read_unlock(); return 0; + } + rcu_read_unlock(); if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags)) return 0; out: @@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = { }; static const struct rpc_credops gss_credops = { - .cr_name = "AUTH_GSS", - .crdestroy = gss_destroy_cred, - .cr_init = gss_cred_init, - .crbind = rpcauth_generic_bind_cred, - .crmatch = gss_match, - .crmarshal = gss_marshal, - .crrefresh = gss_refresh, - .crvalidate = gss_validate, - .crwrap_req = gss_wrap_req, - .crunwrap_resp = gss_unwrap_resp, - .crkey_timeout = gss_key_timeout, + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_cred, + .cr_init = gss_cred_init, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, + .crkey_timeout = gss_key_timeout, + .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_credops gss_nullops = { - .cr_name = "AUTH_GSS", - .crdestroy = gss_destroy_nullcred, - .crbind = rpcauth_generic_bind_cred, - .crmatch = gss_match, - .crmarshal = gss_marshal, - .crrefresh = gss_refresh_null, - .crvalidate = gss_validate, - .crwrap_req = gss_wrap_req, - .crunwrap_resp = gss_unwrap_resp, + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_nullcred, + .crbind = rpcauth_generic_bind_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh_null, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = 
gss_unwrap_resp, + .crstringify_acceptor = gss_stringify_acceptor, }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 0f43e894bc0a..f5ed9f6ece06 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -641,7 +641,7 @@ out: u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, - struct xdr_buf *buf, int ec, struct page **pages) + struct xdr_buf *buf, struct page **pages) { u32 err; struct xdr_netobj hmac; @@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, ecptr = buf->tail[0].iov_base; } - memset(ecptr, 'X', ec); - buf->tail[0].iov_len += ec; - buf->len += ec; - /* copy plaintext gss token header after filler (if any) */ - memcpy(ecptr + ec, buf->head[0].iov_base + offset, - GSS_KRB5_TOK_HDR_LEN); + memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN); buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN; buf->len += GSS_KRB5_TOK_HDR_LEN; diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 62ae3273186c..42768e5c3994 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,31 +70,37 @@ DEFINE_SPINLOCK(krb5_seq_lock); -static char * +static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { - __be16 *ptr, *krb5_hdr; + u16 *ptr; + void *krb5_hdr; int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; token->len = g_token_size(&ctx->mech_used, body_size); - ptr = (__be16 *)token->data; + ptr = (u16 *)token->data; g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr); /* ptr now at start of header described in rfc 1964, section 1.2.1: */ krb5_hdr = ptr; *ptr++ = KG_TOK_MIC_MSG; - *ptr++ = cpu_to_le16(ctx->gk5e->signalg); + /* + * signalg is stored as if it were converted from LE to host endian, even + * though it's an opaque pair of bytes according to the RFC. 
+ */ + *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg); *ptr++ = SEAL_ALG_NONE; - *ptr++ = 0xffff; + *ptr = 0xffff; - return (char *)krb5_hdr; + return krb5_hdr; } static void * setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) { - __be16 *ptr, *krb5_hdr; + u16 *ptr; + void *krb5_hdr; u8 *p, flags = 0x00; if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0) @@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) /* Per rfc 4121, sec 4.2.6.1, there is no header, * just start the token */ - krb5_hdr = ptr = (__be16 *)token->data; + krb5_hdr = ptr = (u16 *)token->data; *ptr++ = KG2_TOK_MIC; p = (u8 *)ptr; *p++ = flags; *p++ = 0xff; - ptr = (__be16 *)p; - *ptr++ = 0xffff; + ptr = (u16 *)p; *ptr++ = 0xffff; + *ptr = 0xffff; token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength; return krb5_hdr; @@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, spin_lock(&krb5_seq_lock); seq_send = ctx->seq_send64++; spin_unlock(&krb5_seq_lock); - *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); + *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send); if (ctx->initiate) { cksumkey = ctx->initiator_sign; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 42560e55d978..4b614c604fe0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength; - *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); - memset(ptr + 4, 0xff, 4); - *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + /* + * signalg and sealalg are stored as if they were converted from LE + * to host endian, even though they're opaque pairs of bytes according + * to the RFC. 
+ */ + *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg); + *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg); + ptr[6] = 0xff; + ptr[7] = 0xff; gss_krb5_make_confounder(msg_start, conflen); @@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, u8 *ptr, *plainhdr; s32 now; u8 flags = 0x00; - __be16 *be16ptr, ec = 0; + __be16 *be16ptr; __be64 *be64ptr; u32 err; @@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, be16ptr = (__be16 *)ptr; blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc); - *be16ptr++ = cpu_to_be16(ec); + *be16ptr++ = 0; /* "inner" token header always uses 0 for RRC */ - *be16ptr++ = cpu_to_be16(0); + *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; spin_lock(&krb5_seq_lock); *be64ptr = cpu_to_be64(kctx->seq_send64++); spin_unlock(&krb5_seq_lock); - err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages); + err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) return err; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 4ce5eccec1f6..c548ab213f76 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -886,7 +886,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs u32 priv_len, maj_stat; int pad, saved_len, remaining_len, offset; - rqstp->rq_splice_ok = 0; + rqstp->rq_splice_ok = false; priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index f0ebe07978a2..712c123e04e9 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth) static struct rpc_cred * nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { + if (flags & RPCAUTH_LOOKUP_RCU) + return &null_cred; return get_rpccred(&null_cred); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2e6ab10734f6..9acd6ce88db7 100644 --- a/net/sunrpc/clnt.c +++ 
b/net/sunrpc/clnt.c @@ -461,6 +461,8 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, if (args->flags & RPC_CLNT_CREATE_AUTOBIND) clnt->cl_autobind = 1; + if (args->flags & RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT) + clnt->cl_noretranstimeo = 1; if (args->flags & RPC_CLNT_CREATE_DISCRTRY) clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) @@ -579,6 +581,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, /* Turn off autobind on clones */ new->cl_autobind = 0; new->cl_softrtry = clnt->cl_softrtry; + new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; return new; @@ -1746,6 +1749,7 @@ call_bind_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -ENOBUFS: case -EPIPE: dprintk("RPC: %5u remote rpcbind unreachable: %d\n", task->tk_pid, task->tk_status); @@ -1812,6 +1816,8 @@ call_connect_status(struct rpc_task *task) case -ECONNABORTED: case -ENETUNREACH: case -EHOSTUNREACH: + case -ENOBUFS: + case -EPIPE: if (RPC_IS_SOFTCONN(task)) break; /* retry with existing socket, after a delay */ @@ -1910,6 +1916,7 @@ call_transmit_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPERM: if (RPC_IS_SOFTCONN(task)) { xprt_end_transmit(task); rpc_exit(task, task->tk_status); @@ -1918,6 +1925,7 @@ call_transmit_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: + case -ENOBUFS: case -EPIPE: rpc_task_force_reencode(task); } @@ -2014,6 +2022,7 @@ call_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPERM: if (RPC_IS_SOFTCONN(task)) { rpc_exit(task, status); break; @@ -2034,6 +2043,7 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + case -ENOBUFS: rpc_delay(task, 3*HZ); case -EPIPE: case -ENOTCONN: diff --git a/net/sunrpc/rpc_pipe.c 
b/net/sunrpc/rpc_pipe.c index b18554898562..2d12b76b5a64 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -195,7 +195,7 @@ static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; - rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); + rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c0365c14b858..fe3441abdbe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,7 +250,7 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(void *word) +static int rpc_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -309,7 +309,7 @@ static int rpc_complete_task(struct rpc_task *task) * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ -int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) +int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) { if (action == NULL) action = rpc_wait_bit_killable; @@ -821,9 +821,7 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { - current->flags |= PF_FSTRANS; __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); - current->flags &= ~PF_FSTRANS; } /** diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5de6801cd924..ca8a7958f4e6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -612,8 +612,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) goto out_enomem; - init_waitqueue_head(&rqstp->rq_wait); - serv->sv_nrthreads++; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; @@ -1086,9 +1084,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off 
only in gss privacy case: */ - rqstp->rq_splice_ok = 1; + rqstp->rq_splice_ok = true; /* Will be turned off only when NFSv4 Sessions are used */ - rqstp->rq_usedeferral = 1; + rqstp->rq_usedeferral = true; rqstp->rq_dropme = false; /* Setup reply header */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b4737fbdec13..c179ca2a5aa4 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -23,6 +23,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static void svc_age_temp_xprts(unsigned long closure); static void svc_delete_xprt(struct svc_xprt *xprt); +static void svc_xprt_do_enqueue(struct svc_xprt *xprt); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -222,11 +223,12 @@ static void svc_xprt_received(struct svc_xprt *xprt) if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) return; /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_enqueue with: + * 'put', so we need a reference to call svc_xprt_do_enqueue with: */ svc_xprt_get(xprt); + smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); + svc_xprt_do_enqueue(xprt); svc_xprt_put(xprt); } @@ -335,12 +337,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) return false; } -/* - * Queue up a transport with data pending. If there are idle nfsd - * processes, wake 'em up. 
- * - */ -void svc_xprt_enqueue(struct svc_xprt *xprt) +static void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp; @@ -349,20 +346,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (!svc_xprt_has_something_to_do(xprt)) return; - cpu = get_cpu(); - pool = svc_pool_for_cpu(xprt->xpt_server, cpu); - put_cpu(); - - spin_lock_bh(&pool->sp_lock); - - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_xprt_enqueue: " - "threads and transports both waiting??\n"); - - pool->sp_stats.packets++; - /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY * atomically because it also guards against trying to enqueue @@ -371,9 +354,15 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { /* Don't enqueue transport while already enqueued */ dprintk("svc: transport %p busy, not enqueued\n", xprt); - goto out_unlock; + return; } + cpu = get_cpu(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + spin_lock_bh(&pool->sp_lock); + + pool->sp_stats.packets++; + if (!list_empty(&pool->sp_threads)) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, @@ -385,18 +374,35 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) printk(KERN_ERR "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", rqstp, rqstp->rq_xprt); - rqstp->rq_xprt = xprt; + /* Note the order of the following 3 lines: + * We want to assign xprt to rqstp->rq_xprt only _after_ + * we've woken up the process, so that we don't race with + * the lockless check in svc_get_next_xprt(). 
+ */ svc_xprt_get(xprt); + wake_up_process(rqstp->rq_task); + rqstp->rq_xprt = xprt; pool->sp_stats.threads_woken++; - wake_up(&rqstp->rq_wait); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); pool->sp_stats.sockets_queued++; } -out_unlock: spin_unlock_bh(&pool->sp_lock); + put_cpu(); +} + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void svc_xprt_enqueue(struct svc_xprt *xprt) +{ + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + svc_xprt_do_enqueue(xprt); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -439,6 +445,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space) atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; + if (xprt->xpt_ops->xpo_adjust_wspace) + xprt->xpt_ops->xpo_adjust_wspace(xprt); svc_xprt_enqueue(xprt); } } @@ -498,7 +506,7 @@ void svc_wake_up(struct svc_serv *serv) svc_thread_dequeue(pool, rqstp); rqstp->rq_xprt = NULL; */ - wake_up(&rqstp->rq_wait); + wake_up_process(rqstp->rq_task); } else pool->sp_task_pending = 1; spin_unlock_bh(&pool->sp_lock); @@ -617,8 +625,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; - DECLARE_WAITQUEUE(wait, current); - long time_left; + long time_left = 0; /* Normally we will wait up to 5 seconds for any required * cache information to be provided. @@ -640,40 +647,32 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) } else { if (pool->sp_task_pending) { pool->sp_task_pending = 0; - spin_unlock_bh(&pool->sp_lock); - return ERR_PTR(-EAGAIN); + xprt = ERR_PTR(-EAGAIN); + goto out; } - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - /* * We have to be able to interrupt this wait * to bring down the daemons ... 
*/ set_current_state(TASK_INTERRUPTIBLE); - /* - * checking kthread_should_stop() here allows us to avoid - * locking and signalling when stopping kthreads that call - * svc_recv. If the thread has already been woken up, then - * we can exit here without sleeping. If not, then it - * it'll be woken up quickly during the schedule_timeout - */ - if (kthread_should_stop()) { - set_current_state(TASK_RUNNING); - spin_unlock_bh(&pool->sp_lock); - return ERR_PTR(-EINTR); - } - - add_wait_queue(&rqstp->rq_wait, &wait); + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); spin_unlock_bh(&pool->sp_lock); - time_left = schedule_timeout(timeout); + if (!(signalled() || kthread_should_stop())) { + time_left = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + try_to_freeze(); - try_to_freeze(); + xprt = rqstp->rq_xprt; + if (xprt != NULL) + return xprt; + } else + __set_current_state(TASK_RUNNING); spin_lock_bh(&pool->sp_lock); - remove_wait_queue(&rqstp->rq_wait, &wait); if (!time_left) pool->sp_stats.threads_timedout++; @@ -688,6 +687,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) return ERR_PTR(-EAGAIN); } } +out: spin_unlock_bh(&pool->sp_lock); return xprt; } @@ -733,7 +733,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_add_new_temp_xprt(serv, newxpt); else module_put(xprt->xpt_class->xcl_owner); - } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { + } else { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, @@ -770,10 +770,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) printk(KERN_ERR "svc_recv: service %p, transport not NULL!\n", rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); err = svc_alloc_arg(rqstp); if (err) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 
b507cd327d9b..3f959c681885 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -312,19 +312,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) } /* - * Check input queue length - */ -static int svc_recv_available(struct svc_sock *svsk) -{ - struct socket *sock = svsk->sk_sock; - int avail, err; - - err = kernel_sock_ioctl(sock, TIOCINQ, (unsigned long) &avail); - - return (err >= 0)? avail : err; -} - -/* * Generic recvfrom routine. */ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, @@ -339,8 +326,14 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, rqstp->rq_xprt_hlen = 0; + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); + /* If we read a full record, then assume there may be more + * data to read (stream based sockets only!) + */ + if (len == buflen) + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); @@ -446,15 +439,43 @@ static void svc_write_space(struct sock *sk) } } +static int svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_xprt.xpt_server; + int required; + + if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) + return 1; + required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; + if (sk_stream_wspace(svsk->sk_sk) >= required || + (sk_stream_min_wspace(svsk->sk_sk) == 0 && + atomic_read(&xprt->xpt_reserved) == 0)) + return 1; + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 0; +} + static void svc_tcp_write_space(struct sock *sk) { + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); struct socket *sock = sk->sk_socket; - if (sk_stream_is_writeable(sk) && sock) + if (!sk_stream_is_writeable(sk) || !sock) + return; + if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt)) 
clear_bit(SOCK_NOSPACE, &sock->flags); svc_write_space(sk); } +static void svc_tcp_adjust_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + + if (svc_tcp_has_wspace(xprt)) + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); +} + /* * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo */ @@ -692,6 +713,7 @@ static struct svc_xprt_class svc_udp_class = { .xcl_owner = THIS_MODULE, .xcl_ops = &svc_udp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, + .xcl_ident = XPRT_TRANSPORT_UDP, }; static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) @@ -951,8 +973,6 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) unsigned int want; int len; - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { struct kvec iov; @@ -1007,7 +1027,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) "%s: Got unrecognized reply: " "calldir 0x%x xpt_bc_xprt %p xid %08x\n", __func__, ntohl(calldir), - bc_xprt, xid); + bc_xprt, ntohl(xid)); return -EAGAIN; } @@ -1044,8 +1064,6 @@ static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) static void svc_tcp_fragment_received(struct svc_sock *svsk) { /* If we have more data, signal svc_xprt_enqueue() to try again */ - if (svc_recv_available(svsk) > sizeof(rpc_fraghdr)) - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); dprintk("svc: TCP %s record (%d bytes)\n", svc_sock_final_rec(svsk) ? 
"final" : "nonfinal", svc_sock_reclen(svsk)); @@ -1197,23 +1215,6 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) svc_putnl(resv, 0); } -static int svc_tcp_has_wspace(struct svc_xprt *xprt) -{ - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int required; - - if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) - return 1; - required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; - if (sk_stream_wspace(svsk->sk_sk) >= required || - (sk_stream_min_wspace(svsk->sk_sk) == 0 && - atomic_read(&xprt->xpt_reserved) == 0)) - return 1; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - return 0; -} - static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -1285,6 +1286,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, + .xpo_adjust_wspace = svc_tcp_adjust_wspace, }; static struct svc_xprt_class svc_tcp_class = { @@ -1292,6 +1294,7 @@ static struct svc_xprt_class svc_tcp_class = { .xcl_owner = THIS_MODULE, .xcl_ops = &svc_tcp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, + .xcl_ident = XPRT_TRANSPORT_TCP, }; void svc_init_xprt_sock(void) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 23fb4e75e245..290af97bf6f9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -509,7 +509,8 @@ void xdr_commit_encode(struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_commit_encode); -__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes) +static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, + size_t nbytes) { static __be32 *p; int space_left; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index c3b2b3369e52..56e4e150e80e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task) case -ECONNABORTED: case -ENETUNREACH: case 
-EHOSTUNREACH: + case -EPIPE: case -EAGAIN: dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; @@ -1306,7 +1307,7 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) } } spin_unlock(&xprt_list_lock); - printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident); + dprintk("RPC: transport (%d) not supported\n", args->ident); return ERR_PTR(-EIO); found: diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 693966d3f33b..6166c985fe24 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - #ifdef RPC_DEBUG static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); + if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); + } return n; } /* + * Marshal chunks. This routine returns the header length + * consumed by marshaling. + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +ssize_t +rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + + if (req->rl_rtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, req->rl_rtype); + else if (req->rl_wtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, req->rl_wtype); + return result; +} + +/* * Copy write data inline. 
* This function is used for "small" requests. Data which is passed * to RPC via iovecs (or page list) is copied directly into the @@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; - enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; + req->rl_wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; + req->rl_wtype = rpcrdma_writech; else - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - rtype = rpcrdma_noch; + req->rl_rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; + req->rl_rtype = rpcrdma_areadch; else - rtype = rpcrdma_readch; + req->rl_rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) + req->rl_wtype = rpcrdma_noch; + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. 
*/ - if (rtype == rpcrdma_noch) { + if (req->rl_rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { + if (req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; + if (req->rl_wtype == rpcrdma_noch) + req->rl_wtype = rpcrdma_replych; } } - /* - * Marshal chunks. This routine will return the header length - * consumed by marshaling. - */ - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_snd_buf, headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_rcv_buf, headerp, wtype); - } + hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 8f92a61ee2df..e0110270d650 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -43,6 +43,7 @@ #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/spinlock.h> +#include <linux/highmem.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -435,6 +436,32 @@ static int 
rdma_read_chunks(struct svcxprt_rdma *xprt, return ret; } +/* + * To avoid a separate RDMA READ just for a handful of zero bytes, + * RFC 5666 section 3.7 allows the client to omit the XDR zero pad + * in chunk lists. + */ +static void +rdma_fix_xdr_pad(struct xdr_buf *buf) +{ + unsigned int page_len = buf->page_len; + unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len; + unsigned int offset, pg_no; + char *p; + + if (size == 0) + return; + + pg_no = page_len >> PAGE_SHIFT; + offset = page_len & ~PAGE_MASK; + p = page_address(buf->pages[pg_no]); + memset(p + offset, 0, size); + + buf->page_len += size; + buf->buflen += size; + buf->len += size; +} + static int rdma_read_complete(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { @@ -449,6 +476,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } /* Point rq_arg.pages past header */ + rdma_fix_xdr_pad(&head->arg); rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; rqstp->rq_arg.page_base = head->arg.page_base; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 49fd21a5c215..9f1b50689c0f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, xdr_sge_no++; BUG_ON(xdr_sge_no > vec->count); bc -= sge_bytes; + if (sge_no == xprt->sc_max_sge) + break; } /* Prepare WRITE WR */ @@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, atomic_inc(&rdma_stat_write); if (svc_rdma_send(xprt, &write_wr)) goto err; - return 0; + return write_len - bc; err: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 0); @@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, { u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; int write_len; - int max_write; u32 
xdr_off; int chunk_off; int chunk_no; @@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[1]; - max_write = xprt->sc_max_sge * PAGE_SIZE; - /* Write chunks start at the pagelist */ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; xfer_len && chunk_no < arg_ary->wc_nchunks; @@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, write_len); chunk_off = 0; while (write_len) { - int this_write; - this_write = min(write_len, max_write); ret = send_write(xprt, rqstp, ntohl(arg_ch->rs_handle), rs_offset + chunk_off, xdr_off, - this_write, + write_len, vec); - if (ret) { + if (ret <= 0) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); return -EIO; } - chunk_off += this_write; - xdr_off += this_write; - xfer_len -= this_write; - write_len -= this_write; + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; } } /* Update the req with the number of chunks actually used */ @@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, { u32 xfer_len = rqstp->rq_res.len; int write_len; - int max_write; u32 xdr_off; int chunk_no; int chunk_off; @@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, res_ary = (struct rpcrdma_write_array *) &rdma_resp->rm_body.rm_chunks[2]; - max_write = xprt->sc_max_sge * PAGE_SIZE; - /* xdr offset starts at RPC message */ nchunks = ntohl(arg_ary->wc_nchunks); for (xdr_off = 0, chunk_no = 0; @@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, write_len); chunk_off = 0; while (write_len) { - int this_write; - - this_write = min(write_len, max_write); ret = send_write(xprt, rqstp, ntohl(ch->rs_handle), rs_offset + chunk_off, xdr_off, - this_write, + write_len, vec); - if (ret) { + if (ret <= 0) { dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", ret); return -EIO; } - chunk_off += this_write; - xdr_off += this_write; - xfer_len -= 
this_write; - write_len -= this_write; + chunk_off += ret; + xdr_off += ret; + xfer_len -= ret; + write_len -= ret; } } /* Update the req with the number of chunks actually used */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e7323fbbd348..4e618808bc98 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -91,7 +91,8 @@ struct svc_xprt_class svc_rdma_class = { .xcl_name = "rdma", .xcl_owner = THIS_MODULE, .xcl_ops = &svc_rdma_ops, - .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, + .xcl_ident = XPRT_TRANSPORT_RDMA, }; struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) @@ -942,23 +943,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { - /* - * XXX: This is a hack. We need a xx_request_qp interface - * that will adjust the qp_attr's with a best-effort - * number - */ - qp_attr.cap.max_send_sge -= 2; - qp_attr.cap.max_recv_sge -= 2; - ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, - &qp_attr); - if (ret) { - dprintk("svcrdma: failed to create QP, ret=%d\n", ret); - goto errout; - } - newxprt->sc_max_sge = qp_attr.cap.max_send_sge; - newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; - newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; - newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; } newxprt->sc_qp = newxprt->sc_cm_id->qp; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 66f91f0d071a..6a4615dd0261 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -205,7 +205,6 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->xprt; int rc = 0; - current->flags |= PF_FSTRANS; xprt_clear_connected(xprt); dprintk("RPC: %s: %sconnect\n", 
__func__, @@ -216,7 +215,6 @@ xprt_rdma_connect_worker(struct work_struct *work) dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); - current->flags &= ~PF_FSTRANS; } /* @@ -296,7 +294,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ - xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; xprt->ops = &xprt_rdma_procs; /* @@ -382,6 +379,9 @@ xprt_setup_rdma(struct xprt_create *args) new_ep->rep_xprt = xprt; xprt_rdma_format_addresses(xprt); + xprt->max_payload = rpcrdma_max_payload(new_xprt); + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); if (!try_module_get(THIS_MODULE)) goto out4; @@ -412,7 +412,7 @@ xprt_rdma_close(struct rpc_xprt *xprt) if (r_xprt->rx_ep.rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } static void @@ -595,13 +595,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; + int rc = 0; - if (req->rl_niovs == 0) { + if (req->rl_niovs == 0) rc = rpcrdma_marshal_req(rqst); - if (rc < 0) - goto failed_marshal; - } + else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + rc = rpcrdma_marshal_chunks(rqst, 0); + if (rc < 0) + goto failed_marshal; if (req->rl_reply == NULL) /* e.g. 
reconnection */ rpcrdma_recv_buffer_get(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 13dbd1c389ff..61c41298b4ea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -61,6 +61,8 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); + /* * internal functions */ @@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); -static inline void -rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) -{ - unsigned long flags; - - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); -} - static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->wr_id == 0ULL) return; if (wc->status != IB_WC_SUCCESS) - return; - - if (wc->opcode == IB_WC_FAST_REG_MR) - frmr->r.frmr.state = FRMR_IS_VALID; - else if (wc->opcode == IB_WC_LOCAL_INV) - frmr->r.frmr.state = FRMR_IS_INVALID; + frmr->r.frmr.fr_state = FRMR_IS_STALE; } static int @@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc) +rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc) } out_schedule: - rpcrdma_schedule_tasklet(rep); + list_add_tail(&rep->rr_list, sched_list); } static int rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { + struct list_head sched_list; struct ib_wc *wcs; int budget, count, rc; + unsigned long flags; + INIT_LIST_HEAD(&sched_list); budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_recv_wcs; rc = ib_poll_cq(cq, 
RPCRDMA_POLLSIZE, wcs); if (rc <= 0) - return rc; + goto out_schedule; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++); + rpcrdma_recvcq_process_wc(wcs++, &sched_list); } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rc = 0; + +out_schedule: + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_splice_tail(&sched_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); + return rc; } /* @@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) rpcrdma_recvcq_poll(cq, ep); } +static void +rpcrdma_flush_cqs(struct rpcrdma_ep *ep) +{ + rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); + rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); +} + #ifdef RPC_DEBUG static const char * const conn[] = { "address resolved", @@ -323,8 +326,16 @@ static const char * const conn[] = { "rejected", "established", "disconnected", - "device removal" + "device removal", + "multicast join", + "multicast error", + "address change", + "timewait exit", }; + +#define CONNECTION_MSG(status) \ + ((status) < ARRAY_SIZE(conn) ? \ + conn[(status)] : "unrecognized connection error") #endif static int @@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", - __func__, - (event->event <= 11) ? conn[event->event] : - "unknown connection error", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - ep, event->event); atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? 
"" : "dis"); ep->rep_connected = connstate; ep->rep_func(ep); wake_up_all(&ep->rep_connect_wait); - break; + /*FALLTHROUGH*/ default: - dprintk("RPC: %s: unexpected CM event %d\n", - __func__, event->event); + dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", + __func__, &addr->sin_addr.s_addr, + ntohs(addr->sin_port), ep, + CONNECTION_MSG(event->event)); break; } @@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_id->device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); -#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; -#else - rc = -ENOMEM; - goto out2; -#endif } } @@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_FRMR: break; -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; -#endif case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; -#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: -#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; + rwlock_init(&ia->ri_qplock); return 0; out2: rdma_destroy_id(ia->ri_id); @@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " returned %i\n", __func__, rc); + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 
int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct rdma_cm_id *id; + struct rdma_cm_id *id, *old; int rc = 0; int retry_count = 0; @@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc && rc != -ENOTCONN) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_ep_disconnect(ep, ia); + rpcrdma_flush_cqs(ep); + + if (ia->ri_memreg_strategy == RPCRDMA_FRMR) + rpcrdma_reset_frmrs(ia); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -905,9 +899,14 @@ retry: rc = -ENETUNREACH; goto out; } - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + + write_lock(&ia->ri_qplock); + old = ia->ri_id; ia->ri_id = id; + write_unlock(&ia->ri_qplock); + + rdma_destroy_qp(old); + rdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -974,13 +973,12 @@ out: * This call is not reentrant, and must not be made in parallel * on the same endpoint. 
*/ -int +void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } +} + +static int +rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_DATA_SEGS, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr failed %i\n", + __func__, rc); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_free: + kfree(r); + return rc; +} + +static int +rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_frmr *f; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + f = &r->r.frmr; + + f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr " + "failed %i\n", __func__, rc); + goto out_free; + } + + f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, + 
ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_pgl)) { + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(f->fr_mr); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; + +out_free: + kfree(r); return rc; } -/* - * Initialize buffer memory - */ int rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) @@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len, rlen, wlen; int i, rc; - struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ - len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - default: - break; - } - /* allocate 1, 4 and 5 in one shot */ p = kzalloc(len, GFP_KERNEL); if (p == NULL) { dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", @@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, p += cdata->padding; INIT_LIST_HEAD(&buf->rb_mws); - r = (struct rpcrdma_mw *)p; + INIT_LIST_HEAD(&buf->rb_all); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: - for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { - 
r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - goto out; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_frmrs(ia, buf); + if (rc) + goto out; break; case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - static struct ib_fmr_attr fa = - { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; - r->r.fmr = ib_alloc_fmr(ia->ri_pd, - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, - &fa); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_fmrs(ia, buf); + if (rc) + goto out; break; default: break; @@ -1176,24 +1204,57 @@ out: return rc; } -/* - * Unregister and destroy buffer memory. Need to deal with - * partial initialization, so it's callable from failed create. - * Must be called before destroying endpoint, as registrations - * reference it. 
- */ +static void +rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + +static void +rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + kfree(r); + } +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *r; + int i; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) * 2. send mr memory (mr free, then kfree) - * 3. padding (if any) [moved to rpcrdma_ep_destroy] - * 4. arrays + * 3. MWs */ dprintk("RPC: %s: entering\n", __func__); @@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rpcrdma_destroy_frmrs(buf); + break; + case RPCRDMA_MTHCAFMR: + rpcrdma_destroy_fmrs(buf); + break; + default: + break; + } + + kfree(buf->rb_pool); +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). 
+ */ +static void +rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct list_head *pos; + struct rpcrdma_mw *r; + int rc; + + list_for_each(pos, &buf->rb_all) { + r = list_entry(pos, struct rpcrdma_mw, mw_all); + + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + continue; + } + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); + continue; + } + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving + * some req segments uninitialized. + */ +static void +rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) +{ + if (*mw) { + list_add_tail(&(*mw)->mw_list, &buf->rb_mws); + *mw = NULL; + } +} + +/* Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. 
+ */ +static void +rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_mr_seg *seg1 = seg; + int i; + + for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) + rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); +} + +static void +rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + buf->rb_send_bufs[--buf->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } +} + +/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). + * Redo only the ib_post_send(). + */ +static void +rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); + + /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ + r->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)r; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", + __func__, r, r->r.frmr.fr_mr->rkey); + + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + r->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send failed, %i\n", + __func__, rc); + } +} + +static void +rpcrdma_retry_flushed_linv(struct list_head *stale, + struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct list_head *pos; + struct rpcrdma_mw 
*r; + unsigned long flags; + + list_for_each(pos, stale) { + r = list_entry(pos, struct rpcrdma_mw, mw_list); + rpcrdma_retry_local_inv(r, ia); + } + + spin_lock_irqsave(&buf->rb_lock, flags); + list_splice_tail(stale, &buf->rb_mws); + spin_unlock_irqrestore(&buf->rb_lock, flags); +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, + struct list_head *stale) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; while (!list_empty(&buf->rb_mws)) { r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); + struct rpcrdma_mw, mw_list); list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - default: - break; + if (r->r.frmr.fr_state == FRMR_IS_STALE) { + list_add(&r->mw_list, stale); + continue; } + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ } - kfree(buf->rb_pool); + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; } /* @@ -1254,10 +1498,10 @@ 
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + struct list_head stale; struct rpcrdma_req *req; unsigned long flags; - int i; - struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; - if (!list_empty(&buffers->rb_mws)) { - i = RPCRDMA_MAX_SEGS - 1; - do { - r = list_entry(buffers->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].mr_chunk.rl_mw = r; - } while (--i >= 0); + + INIT_LIST_HEAD(&stale); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); + break; + case RPCRDMA_MTHCAFMR: + req = rpcrdma_buffer_get_fmrs(req, buffers); + break; + default: + break; } spin_unlock_irqrestore(&buffers->rb_lock, flags); + if (!list_empty(&stale)) + rpcrdma_retry_flushed_linv(&stale, buffers); return req; } @@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - int i; unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_send_bufs[--buffers->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; - req->rl_reply = NULL; - } + rpcrdma_buffer_put_sendbuf(req, buffers); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - /* - * Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. 
- */ - i = 1; - do { - struct rpcrdma_mw **mw; - mw = &req->rl_segments[i].mr_chunk.rl_mw; - list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); - *mw = NULL; - } while (++i < RPCRDMA_MAX_SEGS); - list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, - &buffers->rb_mws); - req->rl_segments[0].mr_chunk.rl_mw = NULL; + rpcrdma_buffer_put_mrs(req, buffers); break; default: break; @@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, */ iov->addr = ib_dma_map_single(ia->ri_id->device, va, len, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + return -ENOMEM; + iov->length = len; if (ia->ri_have_dma_lkey) { @@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; - + struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; u8 key; int len, pageoff; int i, rc; @@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - seg1->mr_chunk.rl_mw->r.frmr.fr_pgl-> - page_list[page_no++] = pa; + frmr->fr_pgl->page_list[page_no++] = pa; pa += PAGE_SIZE; } len += seg->mr_len; @@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, break; } dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, seg1->mr_chunk.rl_mw, i); - - if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { - dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", - __func__, - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); - /* Invalidate before using. 
*/ - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - invalidate_wr.next = &frmr_wr; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; - invalidate_wr.ex.invalidate_rkey = - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - post_wr = &invalidate_wr; - } else - post_wr = &frmr_wr; - - /* Prepare FRMR WR */ - memset(&frmr_wr, 0, sizeof frmr_wr); - frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = IB_SEND_SIGNALED; - frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; - frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; - frmr_wr.wr.fast_reg.page_list_len = page_no; - frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (frmr_wr.wr.fast_reg.length < len) { - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - return -EIO; + __func__, mw, i); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; + if (fastreg_wr.wr.fast_reg.length < len) { + rc = -EIO; + goto out_err; } /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); - frmr_wr.wr.fast_reg.access_flags = (writing ? + fastreg_wr.wr.fast_reg.access_flags = (writing ? 
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); - frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); - + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); + ib_update_fast_reg_key(mr, --key); + goto out_err; } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_rkey = mr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; } *nsegs = i; + return 0; +out_err: + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(ia, --seg); return rc; } @@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof invalidate_wr); invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - if (rc) + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); + } return rc; } @@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); while 
(seg1->mr_nsegs--) rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); if (rc) dprintk("RPC: %s: failed ib_unmap_fmr," " status %i\n", __func__, rc); @@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: rpcrdma_map_one(ia, seg, writing); seg->mr_rkey = ia->ri_bind_mem->rkey; @@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, seg->mr_nsegs = 1; nsegs = 1; break; -#endif /* Registration using frmr registration */ case RPCRDMA_FRMR: @@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: + read_lock(&ia->ri_qplock); rpcrdma_unmap_one(ia, seg); + read_unlock(&ia->ri_qplock); break; -#endif case RPCRDMA_FRMR: rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); @@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, rc); return rc; } + +/* Physical mapping means one Read/Write list entry per-page. + * All list entries must fit within an inline buffer + * + * NB: The server must return a Write list for NFS READ, + * which has the same constraint. Factor in the inline + * rsize as well. 
+ */ +static size_t +rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + unsigned int inline_size, pages; + + inline_size = min_t(unsigned int, + cdata->inline_wsize, cdata->inline_rsize); + inline_size -= RPCRDMA_HDRLEN_MIN; + pages = inline_size / sizeof(struct rpcrdma_segment); + return pages << PAGE_SHIFT; +} + +static size_t +rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) +{ + return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; +} + +size_t +rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) +{ + size_t result; + + switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_ALLPHYSICAL: + result = rpcrdma_physical_max_payload(r_xprt); + break; + default: + result = rpcrdma_mr_max_payload(r_xprt); + } + return result; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 89e7cd479705..ac7fc9a31342 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,6 +51,7 @@ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ +#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -59,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; @@ -98,6 +100,14 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. 
It needs several pieces of @@ -137,6 +147,40 @@ struct rpcrdma_rep { }; /* + * struct rpcrdma_mw - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frmr_state { + FRMR_IS_INVALID, /* ready to be used */ + FRMR_IS_VALID, /* in use */ + FRMR_IS_STALE, /* failed completion */ +}; + +struct rpcrdma_frmr { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum rpcrdma_frmr_state fr_state; +}; + +struct rpcrdma_mw { + union { + struct ib_fmr *fmr; + struct rpcrdma_frmr frmr; + } r; + struct list_head mw_list; + struct list_head mw_all; +}; + +/* * struct rpcrdma_req -- structure central to the request/reply sequence. 
* * N of these are associated with a transport instance, and stored in @@ -163,17 +207,7 @@ struct rpcrdma_rep { struct rpcrdma_mr_seg { /* chunk descriptors */ union { /* chunk memory handles */ struct ib_mr *rl_mr; /* if registered directly */ - struct rpcrdma_mw { /* if registered from region */ - union { - struct ib_fmr *fmr; - struct { - struct ib_fast_reg_page_list *fr_pgl; - struct ib_mr *fr_mr; - enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; - } frmr; - } r; - struct list_head mw_list; - } *rl_mw; + struct rpcrdma_mw *rl_mw; /* if registered from region */ } mr_chunk; u64 mr_base; /* registration result */ u32 mr_rkey; /* registration result */ @@ -191,6 +225,7 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ + enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -214,6 +249,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + struct list_head rb_all; int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -306,7 +342,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); @@ -346,7 +382,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - 
xprtrdma/rpc_rdma.c */ +ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); +size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; @@ -355,4 +393,10 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep; /* Workqueue created in svc_rdma.c */ extern struct workqueue_struct *svc_rdma_wq; +#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) +#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +#else +#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT) +#endif + #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index be8bbd5d65ec..3b305ab17afe 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -399,13 +399,13 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, return kernel_sendmsg(sock, &msg, NULL, 0, 0); } -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy) +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p) { ssize_t (*do_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); struct page **ppage; unsigned int remainder; - int err, sent = 0; + int err; remainder = xdr->page_len - base; base += xdr->page_base; @@ -424,15 +424,15 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i err = do_sendpage(sock, *ppage, base, len, flags); if (remainder == 0 || err != len) break; - sent += err; + *sent_p += err; ppage++; base = 0; } - if (sent == 0) - return err; - if (err > 0) - sent += err; - return sent; + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; } /** @@ -443,12 +443,14 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i * @xdr: buffer containing this 
request * @base: starting position in the buffer * @zerocopy: true if it is safe to use sendpage() + * @sent_p: return the total number of bytes successfully queued for sending * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p) { unsigned int remainder = xdr->len - base; - int err, sent = 0; + int err = 0; + int sent = 0; if (unlikely(!sock)) return -ENOTSOCK; @@ -465,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0); if (remainder == 0 || err != len) goto out; - sent += err; + *sent_p += err; base = 0; } else base -= xdr->head[0].iov_len; @@ -473,23 +475,23 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (base < xdr->page_len) { unsigned int len = xdr->page_len - base; remainder -= len; - err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy); - if (remainder == 0 || err != len) + err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent); + *sent_p += sent; + if (remainder == 0 || sent != len) goto out; - sent += err; base = 0; } else base -= xdr->page_len; if (base >= xdr->tail[0].iov_len) - return sent; + return 0; err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0); out: - if (sent == 0) - return err; - if (err > 0) - sent += err; - return sent; + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; } static void xs_nospace_callback(struct rpc_task *task) @@ -573,19 +575,20 @@ static int xs_local_send_request(struct rpc_task *task) container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; int status; + int sent = 0; xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", 
req->rq_svec->iov_base, req->rq_svec->iov_len); - status = xs_sendpages(transport->sock, NULL, 0, - xdr, req->rq_bytes_sent, true); + status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + true, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - req->rq_bytes_sent, status); - if (likely(status >= 0)) { - req->rq_bytes_sent += status; - req->rq_xmit_bytes_sent += status; + if (likely(sent > 0) || status == 0) { + req->rq_bytes_sent += sent; + req->rq_xmit_bytes_sent += sent; if (likely(req->rq_bytes_sent >= req->rq_slen)) { req->rq_bytes_sent = 0; return 0; @@ -594,6 +597,7 @@ static int xs_local_send_request(struct rpc_task *task) } switch (status) { + case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; @@ -625,6 +629,7 @@ static int xs_udp_send_request(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; + int sent = 0; int status; xs_pktdump("packet data:", @@ -633,22 +638,25 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; - status = xs_sendpages(transport->sock, - xs_addr(xprt), - xprt->addrlen, xdr, - req->rq_bytes_sent, true); + status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, + xdr, req->rq_bytes_sent, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (status >= 0) { - req->rq_xmit_bytes_sent += status; - if (status >= req->rq_slen) + /* firewall is blocking us, don't return -EAGAIN or we end up looping */ + if (status == -EPERM) + goto process_status; + + if (sent > 0 || status == 0) { + req->rq_xmit_bytes_sent += sent; + if (sent >= req->rq_slen) return 0; /* Still some bytes left; set up for a retry later. 
*/ status = -EAGAIN; } +process_status: switch (status) { case -ENOTSOCK: status = -ENOTCONN; @@ -661,8 +669,10 @@ static int xs_udp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ENETUNREACH: + case -ENOBUFS: case -EPIPE: case -ECONNREFUSED: + case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); @@ -711,6 +721,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct xdr_buf *xdr = &req->rq_snd_buf; bool zerocopy = true; int status; + int sent; xs_encode_stream_record_marker(&req->rq_snd_buf); @@ -728,26 +739,26 @@ static int xs_tcp_send_request(struct rpc_task *task) * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ while (1) { - status = xs_sendpages(transport->sock, - NULL, 0, xdr, req->rq_bytes_sent, - zerocopy); + sent = 0; + status = xs_sendpages(transport->sock, NULL, 0, xdr, + req->rq_bytes_sent, zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (unlikely(status < 0)) + if (unlikely(sent == 0 && status < 0)) break; /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ - req->rq_bytes_sent += status; - req->rq_xmit_bytes_sent += status; + req->rq_bytes_sent += sent; + req->rq_xmit_bytes_sent += sent; if (likely(req->rq_bytes_sent >= req->rq_slen)) { req->rq_bytes_sent = 0; return 0; } - if (status != 0) + if (sent != 0) continue; status = -EAGAIN; break; @@ -758,6 +769,7 @@ static int xs_tcp_send_request(struct rpc_task *task) status = -ENOTCONN; /* Should we call xs_close() here? 
*/ break; + case -ENOBUFS: case -EAGAIN: status = xs_nospace(task); break; @@ -842,6 +854,8 @@ static void xs_error_report(struct sock *sk) dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); + if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state)) + goto out; xprt_wake_pending_tasks(xprt, err); out: read_unlock_bh(&sk->sk_callback_lock); @@ -1743,13 +1757,29 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) unsigned short port = xs_get_srcport(transport); unsigned short last; + /* + * If we are asking for any ephemeral port (i.e. port == 0 && + * transport->xprt.resvport == 0), don't bind. Let the local + * port selection happen implicitly when the socket is used + * (for example at connect time). + * + * This ensures that we can continue to establish TCP + * connections even when all local ephemeral ports are already + * a part of some TCP connection. This makes no difference + * for UDP sockets, but also doens't harm them. + * + * If we're asking for any reserved port (i.e. port == 0 && + * transport->xprt.resvport == 1) xs_get_srcport above will + * ensure that port is non-zero and we will bind as needed. 
+ */ + if (port == 0) + return 0; + memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); - if (port == 0) - break; if (err == 0) { transport->srcport = port; break; @@ -1924,8 +1954,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct socket *sock; int status = -EIO; - current->flags |= PF_FSTRANS; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); @@ -1946,6 +1974,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_set_connected(xprt); + case -ENOBUFS: break; case -ENOENT: dprintk("RPC: xprt %p: socket %s does not exist\n", @@ -1964,7 +1993,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; return status; } @@ -2067,8 +2095,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - current->flags |= PF_FSTRANS; - /* Start by resetting any existing state */ xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, @@ -2088,7 +2114,6 @@ static void xs_udp_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; } /* @@ -2225,8 +2250,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - current->flags |= PF_FSTRANS; - if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, @@ -2241,7 +2264,9 @@ static void xs_tcp_setup_socket(struct work_struct *work) abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* 
"close" the socket, preserving the local port */ + set_bit(XPRT_CONNECTION_REUSE, &xprt->state); xs_tcp_reuse_connection(transport); + clear_bit(XPRT_CONNECTION_REUSE, &xprt->state); if (abort_and_exit) goto out_eagain; @@ -2272,7 +2297,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); - current->flags &= ~PF_FSTRANS; return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2281,6 +2305,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -ENOBUFS: /* retry with existing socket, after a delay */ goto out; } @@ -2289,7 +2314,6 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; } /** @@ -3054,12 +3078,12 @@ static int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max) { - unsigned long num; + unsigned int num; int ret; if (!val) return -EINVAL; - ret = strict_strtoul(val, 0, &num); + ret = kstrtouint(val, 0, &num); if (ret == -EINVAL || num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; diff --git a/net/tipc/Makefile b/net/tipc/Makefile index a080c66d819a..b8a13caad59a 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o config.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o name_table.o net.o \ - netlink.o node.o node_subscr.o port.o ref.o \ + netlink.o node.o node_subscr.o \ socket.o log.o eth_media.o server.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 55c6c9d3e1ce..b8670bf262e2 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -1,7 +1,7 @@ /* * net/tipc/bcast.c: TIPC broadcast code * - * Copyright (c) 2004-2006, Ericsson AB + * Copyright (c) 2004-2006, 2014, Ericsson AB * Copyright (c) 2004, Intel 
Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. @@ -37,7 +37,8 @@ #include "core.h" #include "link.h" -#include "port.h" +#include "socket.h" +#include "msg.h" #include "bcast.h" #include "name_distr.h" @@ -138,6 +139,11 @@ static void tipc_bclink_unlock(void) tipc_link_reset_all(node); } +uint tipc_bclink_get_mtu(void) +{ + return MAX_PKT_DEFAULT_MCAST; +} + void tipc_bclink_set_flags(unsigned int flags) { bclink->flags |= flags; @@ -220,6 +226,17 @@ static void bclink_retransmit_pkt(u32 after, u32 to) } /** + * tipc_bclink_wakeup_users - wake up pending users + * + * Called with no locks taken + */ +void tipc_bclink_wakeup_users(void) +{ + while (skb_queue_len(&bclink->link.waiting_sks)) + tipc_sk_rcv(skb_dequeue(&bclink->link.waiting_sks)); +} + +/** * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets * @n_ptr: node that sent acknowledgement info * @acked: broadcast sequence # that has been acknowledged @@ -293,8 +310,9 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) tipc_link_push_queue(bcl); bclink_set_last_sent(); } - if (unlikely(released && !list_empty(&bcl->waiting_ports))) - tipc_link_wakeup_ports(bcl, 0); + if (unlikely(released && !skb_queue_empty(&bcl->waiting_sks))) + n_ptr->action_flags |= TIPC_WAKEUP_BCAST_USERS; + exit: tipc_bclink_unlock(); } @@ -382,30 +400,50 @@ static void bclink_peek_nack(struct tipc_msg *msg) tipc_node_unlock(n_ptr); } -/* - * tipc_bclink_xmit - broadcast a packet to all nodes in cluster +/* tipc_bclink_xmit - broadcast buffer chain to all nodes in cluster + * and to identified node local sockets + * @buf: chain of buffers containing message + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ int tipc_bclink_xmit(struct sk_buff *buf) { - int res; - - tipc_bclink_lock(); + int rc = 0; + int bc = 0; + struct sk_buff *clbuf; - if (!bclink->bcast_nodes.count) 
{ - res = msg_data_sz(buf_msg(buf)); - kfree_skb(buf); - goto exit; + /* Prepare clone of message for local node */ + clbuf = tipc_msg_reassemble(buf); + if (unlikely(!clbuf)) { + kfree_skb_list(buf); + return -EHOSTUNREACH; } - res = __tipc_link_xmit(bcl, buf); - if (likely(res >= 0)) { - bclink_set_last_sent(); - bcl->stats.queue_sz_counts++; - bcl->stats.accu_queue_sz += bcl->out_queue_size; + /* Broadcast to all other nodes */ + if (likely(bclink)) { + tipc_bclink_lock(); + if (likely(bclink->bcast_nodes.count)) { + rc = __tipc_link_xmit(bcl, buf); + if (likely(!rc)) { + bclink_set_last_sent(); + bcl->stats.queue_sz_counts++; + bcl->stats.accu_queue_sz += bcl->out_queue_size; + } + bc = 1; + } + tipc_bclink_unlock(); } -exit: - tipc_bclink_unlock(); - return res; + + if (unlikely(!bc)) + kfree_skb_list(buf); + + /* Deliver message clone */ + if (likely(!rc)) + tipc_sk_mcast_rcv(clbuf); + else + kfree_skb(clbuf); + + return rc; } /** @@ -443,7 +481,7 @@ void tipc_bclink_rcv(struct sk_buff *buf) struct tipc_node *node; u32 next_in; u32 seqno; - int deferred; + int deferred = 0; /* Screen out unwanted broadcast messages */ @@ -494,7 +532,7 @@ receive: tipc_bclink_unlock(); tipc_node_unlock(node); if (likely(msg_mcast(msg))) - tipc_port_mcast_rcv(buf, NULL); + tipc_sk_mcast_rcv(buf); else kfree_skb(buf); } else if (msg_user(msg) == MSG_BUNDLER) { @@ -573,8 +611,7 @@ receive: node->bclink.deferred_size += deferred; bclink_update_last_sent(node, seqno); buf = NULL; - } else - deferred = 0; + } tipc_bclink_lock(); @@ -611,6 +648,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, struct tipc_media_addr *unused2) { int bp_index; + struct tipc_msg *msg = buf_msg(buf); /* Prepare broadcast link message for reliable transmission, * if first time trying to send it; @@ -618,10 +656,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, * since they are sent in an unreliable manner and don't need it */ if 
(likely(!msg_non_seq(buf_msg(buf)))) { - struct tipc_msg *msg; - bcbuf_set_acks(buf, bclink->bcast_nodes.count); - msg = buf_msg(buf); msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tipc_net_id); bcl->stats.sent_info++; @@ -638,12 +673,14 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary; struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary; - struct tipc_bearer *b = p; + struct tipc_bearer *bp[2] = {p, s}; + struct tipc_bearer *b = bp[msg_link_selector(msg)]; struct sk_buff *tbuf; if (!p) break; /* No more bearers to try */ - + if (!b) + b = p; tipc_nmap_diff(&bcbearer->remains, &b->nodes, &bcbearer->remains_new); if (bcbearer->remains_new.count == bcbearer->remains.count) @@ -660,13 +697,6 @@ static int tipc_bcbearer_send(struct sk_buff *buf, struct tipc_bearer *unused1, tipc_bearer_send(b->identity, tbuf, &b->bcast_addr); kfree_skb(tbuf); /* Bearer keeps a clone */ } - - /* Swap bearers for next packet */ - if (s) { - bcbearer->bpairs[bp_index].primary = s; - bcbearer->bpairs[bp_index].secondary = p; - } - if (bcbearer->remains_new.count == 0) break; /* All targets reached */ @@ -821,9 +851,10 @@ int tipc_bclink_init(void) sprintf(bcbearer->media.name, "tipc-broadcast"); spin_lock_init(&bclink->lock); - INIT_LIST_HEAD(&bcl->waiting_ports); + __skb_queue_head_init(&bcl->waiting_sks); bcl->next_out_no = 1; spin_lock_init(&bclink->node.lock); + __skb_queue_head_init(&bclink->node.waiting_sks); bcl->owner = &bclink->node; bcl->max_pkt = MAX_PKT_DEFAULT_MCAST; tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 00330c45df3e..e7b0f85a82bc 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -1,7 +1,7 @@ /* * net/tipc/bcast.h: Include file for TIPC broadcast code * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2006, 2014, Ericsson AB * 
Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -89,7 +89,6 @@ void tipc_bclink_add_node(u32 addr); void tipc_bclink_remove_node(u32 addr); struct tipc_node *tipc_bclink_retransmit_to(void); void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked); -int tipc_bclink_xmit(struct sk_buff *buf); void tipc_bclink_rcv(struct sk_buff *buf); u32 tipc_bclink_get_last_sent(void); u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr); @@ -98,5 +97,7 @@ int tipc_bclink_stats(char *stats_buf, const u32 buf_size); int tipc_bclink_reset_stats(void); int tipc_bclink_set_queue_limits(u32 limit); void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action); - +uint tipc_bclink_get_mtu(void); +int tipc_bclink_xmit(struct sk_buff *buf); +void tipc_bclink_wakeup_users(void); #endif diff --git a/net/tipc/config.c b/net/tipc/config.c index 2b42403ad33a..876f4c6a2631 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -35,7 +35,7 @@ */ #include "core.h" -#include "port.h" +#include "socket.h" #include "name_table.h" #include "config.h" #include "server.h" @@ -266,7 +266,7 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area rep_tlv_buf = tipc_media_get_names(); break; case TIPC_CMD_SHOW_PORTS: - rep_tlv_buf = tipc_port_get_ports(); + rep_tlv_buf = tipc_sk_socks_show(); break; case TIPC_CMD_SHOW_STATS: rep_tlv_buf = tipc_show_stats(); diff --git a/net/tipc/core.c b/net/tipc/core.c index 676d18015dd8..a5737b8407dd 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -35,11 +35,10 @@ */ #include "core.h" -#include "ref.h" #include "name_table.h" #include "subscr.h" #include "config.h" -#include "port.h" +#include "socket.h" #include <linux/module.h> @@ -85,7 +84,7 @@ static void tipc_core_stop(void) tipc_netlink_stop(); tipc_subscr_stop(); tipc_nametbl_stop(); - tipc_ref_table_stop(); + tipc_sk_ref_table_stop(); tipc_socket_stop(); tipc_unregister_sysctl(); } @@ -99,7 +98,7 @@ static int 
tipc_core_start(void) get_random_bytes(&tipc_random, sizeof(tipc_random)); - err = tipc_ref_table_init(tipc_max_ports, tipc_random); + err = tipc_sk_ref_table_init(tipc_max_ports, tipc_random); if (err) goto out_reftbl; @@ -139,7 +138,7 @@ out_socket: out_netlink: tipc_nametbl_stop(); out_nametbl: - tipc_ref_table_stop(); + tipc_sk_ref_table_stop(); out_reftbl: return err; } diff --git a/net/tipc/core.h b/net/tipc/core.h index bb26ed1ee966..f773b148722f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -81,6 +81,7 @@ extern u32 tipc_own_addr __read_mostly; extern int tipc_max_ports __read_mostly; extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; +extern int sysctl_tipc_named_timeout __read_mostly; /* * Other global variables @@ -187,8 +188,11 @@ static inline void k_term_timer(struct timer_list *timer) struct tipc_skb_cb { void *handle; - bool deferred; struct sk_buff *tail; + bool deferred; + bool wakeup_pending; + u16 chain_sz; + u16 chain_imp; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) diff --git a/net/tipc/link.c b/net/tipc/link.c index ad2c57f5868d..65410e18b8a6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -36,7 +36,6 @@ #include "core.h" #include "link.h" -#include "port.h" #include "socket.h" #include "name_distr.h" #include "discover.h" @@ -82,15 +81,13 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, struct sk_buff *buf); static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr, struct sk_buff **buf); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance); -static int tipc_link_iovec_long_xmit(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff 
*buf); static void tipc_link_sync_xmit(struct tipc_link *l); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); +static int tipc_link_input(struct tipc_link *l, struct sk_buff *buf); +static int tipc_link_prepare_input(struct tipc_link *l, struct sk_buff **buf); /* * Simple link routines @@ -277,7 +274,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, link_init_max_pkt(l_ptr); l_ptr->next_out_no = 1; - INIT_LIST_HEAD(&l_ptr->waiting_ports); + __skb_queue_head_init(&l_ptr->waiting_sks); link_reset_statistics(l_ptr); @@ -324,62 +321,47 @@ void tipc_link_delete_list(unsigned int bearer_id, bool shutting_down) } /** - * link_schedule_port - schedule port for deferred sending - * @l_ptr: pointer to link - * @origport: reference to sending port - * @sz: amount of data to be sent - * - * Schedules port for renewed sending of messages after link congestion - * has abated. + * link_schedule_user - schedule user for wakeup after congestion + * @link: congested link + * @oport: sending port + * @chain_sz: size of buffer chain that was attempted sent + * @imp: importance of message attempted sent + * Create pseudo msg to send back to user when congestion abates */ -static int link_schedule_port(struct tipc_link *l_ptr, u32 origport, u32 sz) +static bool link_schedule_user(struct tipc_link *link, u32 oport, + uint chain_sz, uint imp) { - struct tipc_port *p_ptr; + struct sk_buff *buf; - spin_lock_bh(&tipc_port_list_lock); - p_ptr = tipc_port_lock(origport); - if (p_ptr) { - if (!list_empty(&p_ptr->wait_list)) - goto exit; - p_ptr->congested = 1; - p_ptr->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt); - list_add_tail(&p_ptr->wait_list, &l_ptr->waiting_ports); - l_ptr->stats.link_congs++; -exit: - tipc_port_unlock(p_ptr); - } - spin_unlock_bh(&tipc_port_list_lock); - return -ELINKCONG; + buf = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, tipc_own_addr, + tipc_own_addr, oport, 0, 0); + if (!buf) + return false; + TIPC_SKB_CB(buf)->chain_sz 
= chain_sz; + TIPC_SKB_CB(buf)->chain_imp = imp; + __skb_queue_tail(&link->waiting_sks, buf); + link->stats.link_congs++; + return true; } -void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all) +/** + * link_prepare_wakeup - prepare users for wakeup after congestion + * @link: congested link + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to node wait queue for wakeup + */ +static void link_prepare_wakeup(struct tipc_link *link) { - struct tipc_port *p_ptr; - struct tipc_port *temp_p_ptr; - int win = l_ptr->queue_limit[0] - l_ptr->out_queue_size; + struct sk_buff_head *wq = &link->waiting_sks; + struct sk_buff *buf; + uint pend_qsz = link->out_queue_size; - if (all) - win = 100000; - if (win <= 0) - return; - if (!spin_trylock_bh(&tipc_port_list_lock)) - return; - if (link_congested(l_ptr)) - goto exit; - list_for_each_entry_safe(p_ptr, temp_p_ptr, &l_ptr->waiting_ports, - wait_list) { - if (win <= 0) + for (buf = skb_peek(wq); buf; buf = skb_peek(wq)) { + if (pend_qsz >= link->queue_limit[TIPC_SKB_CB(buf)->chain_imp]) break; - list_del_init(&p_ptr->wait_list); - spin_lock_bh(p_ptr->lock); - p_ptr->congested = 0; - tipc_port_wakeup(p_ptr); - win -= p_ptr->waiting_pkts; - spin_unlock_bh(p_ptr->lock); + pend_qsz += TIPC_SKB_CB(buf)->chain_sz; + __skb_queue_tail(&link->owner->waiting_sks, __skb_dequeue(wq)); } - -exit: - spin_unlock_bh(&tipc_port_list_lock); } /** @@ -421,6 +403,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) u32 prev_state = l_ptr->state; u32 checkpoint = l_ptr->next_in_no; int was_active_link = tipc_link_is_active(l_ptr); + struct tipc_node *owner = l_ptr->owner; msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); @@ -448,9 +431,10 @@ void tipc_link_reset(struct tipc_link *l_ptr) kfree_skb(l_ptr->proto_msg_queue); l_ptr->proto_msg_queue = NULL; kfree_skb_list(l_ptr->oldest_deferred_in); - if (!list_empty(&l_ptr->waiting_ports)) - 
tipc_link_wakeup_ports(l_ptr, 1); - + if (!skb_queue_empty(&l_ptr->waiting_sks)) { + skb_queue_splice_init(&l_ptr->waiting_sks, &owner->waiting_sks); + owner->action_flags |= TIPC_WAKEUP_USERS; + } l_ptr->retransm_queue_head = 0; l_ptr->retransm_queue_size = 0; l_ptr->last_out = NULL; @@ -676,178 +660,146 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) } } -/* - * link_bundle_buf(): Append contents of a buffer to - * the tail of an existing one. +/* tipc_link_cong: determine return value and how to treat the + * sent buffer during link congestion. + * - For plain, errorless user data messages we keep the buffer and + * return -ELINKONG. + * - For all other messages we discard the buffer and return -EHOSTUNREACH + * - For TIPC internal messages we also reset the link */ -static int link_bundle_buf(struct tipc_link *l_ptr, struct sk_buff *bundler, - struct sk_buff *buf) +static int tipc_link_cong(struct tipc_link *link, struct sk_buff *buf) { - struct tipc_msg *bundler_msg = buf_msg(bundler); struct tipc_msg *msg = buf_msg(buf); - u32 size = msg_size(msg); - u32 bundle_size = msg_size(bundler_msg); - u32 to_pos = align(bundle_size); - u32 pad = to_pos - bundle_size; - - if (msg_user(bundler_msg) != MSG_BUNDLER) - return 0; - if (msg_type(bundler_msg) != OPEN_MSG) - return 0; - if (skb_tailroom(bundler) < (pad + size)) - return 0; - if (l_ptr->max_pkt < (to_pos + size)) - return 0; - - skb_put(bundler, pad + size); - skb_copy_to_linear_data_offset(bundler, to_pos, buf->data, size); - msg_set_size(bundler_msg, to_pos + size); - msg_set_msgcnt(bundler_msg, msg_msgcnt(bundler_msg) + 1); - kfree_skb(buf); - l_ptr->stats.sent_bundled++; - return 1; -} - -static void link_add_to_outqueue(struct tipc_link *l_ptr, - struct sk_buff *buf, - struct tipc_msg *msg) -{ - u32 ack = mod(l_ptr->next_in_no - 1); - u32 seqno = mod(l_ptr->next_out_no++); + uint imp = tipc_msg_tot_importance(msg); + u32 oport = msg_tot_origport(msg); - msg_set_word(msg, 2, 
((ack << 16) | seqno)); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - buf->next = NULL; - if (l_ptr->first_out) { - l_ptr->last_out->next = buf; - l_ptr->last_out = buf; - } else - l_ptr->first_out = l_ptr->last_out = buf; - - l_ptr->out_queue_size++; - if (l_ptr->out_queue_size > l_ptr->stats.max_queue_sz) - l_ptr->stats.max_queue_sz = l_ptr->out_queue_size; -} - -static void link_add_chain_to_outqueue(struct tipc_link *l_ptr, - struct sk_buff *buf_chain, - u32 long_msgno) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - - if (!l_ptr->next_out) - l_ptr->next_out = buf_chain; - while (buf_chain) { - buf = buf_chain; - buf_chain = buf_chain->next; - - msg = buf_msg(buf); - msg_set_long_msgno(msg, long_msgno); - link_add_to_outqueue(l_ptr, buf, msg); + if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { + pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); + tipc_link_reset(link); + goto drop; } + if (unlikely(msg_errcode(msg))) + goto drop; + if (unlikely(msg_reroute_cnt(msg))) + goto drop; + if (TIPC_SKB_CB(buf)->wakeup_pending) + return -ELINKCONG; + if (link_schedule_user(link, oport, TIPC_SKB_CB(buf)->chain_sz, imp)) + return -ELINKCONG; +drop: + kfree_skb_list(buf); + return -EHOSTUNREACH; } -/* - * tipc_link_xmit() is the 'full path' for messages, called from - * inside TIPC when the 'fast path' in tipc_send_xmit - * has failed, and from link_send() +/** + * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked + * @link: link to use + * @buf: chain of buffers containing message + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG, -EMSGSIZE (plain socket + * user data messages) or -EHOSTUNREACH (all other messages/senders) + * Only the socket functions tipc_send_stream() and tipc_send_packet() need + * to act on the return value, since they may need to do more send attempts. 
*/ -int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) +int __tipc_link_xmit(struct tipc_link *link, struct sk_buff *buf) { struct tipc_msg *msg = buf_msg(buf); - u32 size = msg_size(msg); - u32 dsz = msg_data_sz(msg); - u32 queue_size = l_ptr->out_queue_size; - u32 imp = tipc_msg_tot_importance(msg); - u32 queue_limit = l_ptr->queue_limit[imp]; - u32 max_packet = l_ptr->max_pkt; - - /* Match msg importance against queue limits: */ - if (unlikely(queue_size >= queue_limit)) { - if (imp <= TIPC_CRITICAL_IMPORTANCE) { - link_schedule_port(l_ptr, msg_origport(msg), size); - kfree_skb(buf); - return -ELINKCONG; - } - kfree_skb(buf); - if (imp > CONN_MANAGER) { - pr_warn("%s<%s>, send queue full", link_rst_msg, - l_ptr->name); - tipc_link_reset(l_ptr); - } - return dsz; + uint psz = msg_size(msg); + uint qsz = link->out_queue_size; + uint sndlim = link->queue_limit[0]; + uint imp = tipc_msg_tot_importance(msg); + uint mtu = link->max_pkt; + uint ack = mod(link->next_in_no - 1); + uint seqno = link->next_out_no; + uint bc_last_in = link->owner->bclink.last_in; + struct tipc_media_addr *addr = &link->media_addr; + struct sk_buff *next = buf->next; + + /* Match queue limits against msg importance: */ + if (unlikely(qsz >= link->queue_limit[imp])) + return tipc_link_cong(link, buf); + + /* Has valid packet limit been used ? */ + if (unlikely(psz > mtu)) { + kfree_skb_list(buf); + return -EMSGSIZE; } - /* Fragmentation needed ? */ - if (size > max_packet) - return tipc_link_frag_xmit(l_ptr, buf); - - /* Packet can be queued or sent. 
*/ - if (likely(!link_congested(l_ptr))) { - link_add_to_outqueue(l_ptr, buf, msg); + /* Prepare each packet for sending, and add to outqueue: */ + while (buf) { + next = buf->next; + msg = buf_msg(buf); + msg_set_word(msg, 2, ((ack << 16) | mod(seqno))); + msg_set_bcast_ack(msg, bc_last_in); + + if (!link->first_out) { + link->first_out = buf; + } else if (qsz < sndlim) { + link->last_out->next = buf; + } else if (tipc_msg_bundle(link->last_out, buf, mtu)) { + link->stats.sent_bundled++; + buf = next; + next = buf->next; + continue; + } else if (tipc_msg_make_bundle(&buf, mtu, link->addr)) { + link->stats.sent_bundled++; + link->stats.sent_bundles++; + link->last_out->next = buf; + if (!link->next_out) + link->next_out = buf; + } else { + link->last_out->next = buf; + if (!link->next_out) + link->next_out = buf; + } - tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); - l_ptr->unacked_window = 0; - return dsz; - } - /* Congestion: can message be bundled ? */ - if ((msg_user(msg) != CHANGEOVER_PROTOCOL) && - (msg_user(msg) != MSG_FRAGMENTER)) { - - /* Try adding message to an existing bundle */ - if (l_ptr->next_out && - link_bundle_buf(l_ptr, l_ptr->last_out, buf)) - return dsz; - - /* Try creating a new bundle */ - if (size <= max_packet * 2 / 3) { - struct sk_buff *bundler = tipc_buf_acquire(max_packet); - struct tipc_msg bundler_hdr; - - if (bundler) { - tipc_msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG, - INT_H_SIZE, l_ptr->addr); - skb_copy_to_linear_data(bundler, &bundler_hdr, - INT_H_SIZE); - skb_trim(bundler, INT_H_SIZE); - link_bundle_buf(l_ptr, bundler, buf); - buf = bundler; - msg = buf_msg(buf); - l_ptr->stats.sent_bundles++; - } + /* Send packet if possible: */ + if (likely(++qsz <= sndlim)) { + tipc_bearer_send(link->bearer_id, buf, addr); + link->next_out = next; + link->unacked_window = 0; } + seqno++; + link->last_out = buf; + buf = next; } - if (!l_ptr->next_out) - l_ptr->next_out = buf; - link_add_to_outqueue(l_ptr, buf, msg); - return 
dsz; + link->next_out_no = seqno; + link->out_queue_size = qsz; + return 0; } -/* - * tipc_link_xmit(): same as __tipc_link_xmit(), but the link to use - * has not been selected yet, and the the owner node is not locked - * Called by TIPC internal users, e.g. the name distributor +/** + * tipc_link_xmit() is the general link level function for message sending + * @buf: chain of buffers containing message + * @dsz: amount of user data to be sent + * @dnode: address of destination node + * @selector: a number used for deterministic link selection + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ -int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector) +int tipc_link_xmit(struct sk_buff *buf, u32 dnode, u32 selector) { - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - int res = -ELINKCONG; + struct tipc_link *link = NULL; + struct tipc_node *node; + int rc = -EHOSTUNREACH; - n_ptr = tipc_node_find(dest); - if (n_ptr) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[selector & 1]; - if (l_ptr) - res = __tipc_link_xmit(l_ptr, buf); - else - kfree_skb(buf); - tipc_node_unlock(n_ptr); - } else { - kfree_skb(buf); + node = tipc_node_find(dnode); + if (node) { + tipc_node_lock(node); + link = node->active_links[selector & 1]; + if (link) + rc = __tipc_link_xmit(link, buf); + tipc_node_unlock(node); } - return res; + + if (link) + return rc; + + if (likely(in_own_node(dnode))) + return tipc_sk_rcv(buf); + + kfree_skb_list(buf); + return rc; } /* @@ -858,7 +810,7 @@ int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector) * * Called with node locked */ -static void tipc_link_sync_xmit(struct tipc_link *l) +static void tipc_link_sync_xmit(struct tipc_link *link) { struct sk_buff *buf; struct tipc_msg *msg; @@ -868,10 +820,9 @@ static void tipc_link_sync_xmit(struct tipc_link *l) return; msg = buf_msg(buf); - tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, 
INT_H_SIZE, l->addr); - msg_set_last_bcast(msg, l->owner->bclink.acked); - link_add_chain_to_outqueue(l, buf, 0); - tipc_link_push_queue(l); + tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, link->addr); + msg_set_last_bcast(msg, link->owner->bclink.acked); + __tipc_link_xmit(link, buf); } /* @@ -892,293 +843,6 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf) } /* - * tipc_link_names_xmit - send name table entries to new neighbor - * - * Send routine for bulk delivery of name table messages when contact - * with a new neighbor occurs. No link congestion checking is performed - * because name table messages *must* be delivered. The messages must be - * small enough not to require fragmentation. - * Called without any locks held. - */ -void tipc_link_names_xmit(struct list_head *message_list, u32 dest) -{ - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct sk_buff *buf; - struct sk_buff *temp_buf; - - if (list_empty(message_list)) - return; - - n_ptr = tipc_node_find(dest); - if (n_ptr) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[0]; - if (l_ptr) { - /* convert circular list to linear list */ - ((struct sk_buff *)message_list->prev)->next = NULL; - link_add_chain_to_outqueue(l_ptr, - (struct sk_buff *)message_list->next, 0); - tipc_link_push_queue(l_ptr); - INIT_LIST_HEAD(message_list); - } - tipc_node_unlock(n_ptr); - } - - /* discard the messages if they couldn't be sent */ - list_for_each_safe(buf, temp_buf, ((struct sk_buff *)message_list)) { - list_del((struct list_head *)buf); - kfree_skb(buf); - } -} - -/* - * tipc_link_xmit_fast: Entry for data messages where the - * destination link is known and the header is complete, - * inclusive total message length. Very time critical. - * Link is locked. Returns user data length. 
- */ -static int tipc_link_xmit_fast(struct tipc_link *l_ptr, struct sk_buff *buf, - u32 *used_max_pkt) -{ - struct tipc_msg *msg = buf_msg(buf); - int res = msg_data_sz(msg); - - if (likely(!link_congested(l_ptr))) { - if (likely(msg_size(msg) <= l_ptr->max_pkt)) { - link_add_to_outqueue(l_ptr, buf, msg); - tipc_bearer_send(l_ptr->bearer_id, buf, - &l_ptr->media_addr); - l_ptr->unacked_window = 0; - return res; - } - else - *used_max_pkt = l_ptr->max_pkt; - } - return __tipc_link_xmit(l_ptr, buf); /* All other cases */ -} - -/* - * tipc_link_iovec_xmit_fast: Entry for messages where the - * destination processor is known and the header is complete, - * except for total message length. - * Returns user data length or errno. - */ -int tipc_link_iovec_xmit_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) -{ - struct tipc_msg *hdr = &sender->phdr; - struct tipc_link *l_ptr; - struct sk_buff *buf; - struct tipc_node *node; - int res; - u32 selector = msg_origport(hdr) & 1; - -again: - /* - * Try building message using port's max_pkt hint. - * (Must not hold any locks while building message.) 
- */ - res = tipc_msg_build(hdr, msg_sect, len, sender->max_pkt, &buf); - /* Exit if build request was invalid */ - if (unlikely(res < 0)) - return res; - - node = tipc_node_find(destaddr); - if (likely(node)) { - tipc_node_lock(node); - l_ptr = node->active_links[selector]; - if (likely(l_ptr)) { - if (likely(buf)) { - res = tipc_link_xmit_fast(l_ptr, buf, - &sender->max_pkt); -exit: - tipc_node_unlock(node); - return res; - } - - /* Exit if link (or bearer) is congested */ - if (link_congested(l_ptr)) { - res = link_schedule_port(l_ptr, - sender->ref, res); - goto exit; - } - - /* - * Message size exceeds max_pkt hint; update hint, - * then re-try fast path or fragment the message - */ - sender->max_pkt = l_ptr->max_pkt; - tipc_node_unlock(node); - - - if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt) - goto again; - - return tipc_link_iovec_long_xmit(sender, msg_sect, - len, destaddr); - } - tipc_node_unlock(node); - } - - /* Couldn't find a link to the destination node */ - kfree_skb(buf); - tipc_port_iovec_reject(sender, hdr, msg_sect, len, TIPC_ERR_NO_NODE); - return -ENETUNREACH; -} - -/* - * tipc_link_iovec_long_xmit(): Entry for long messages where the - * destination node is known and the header is complete, - * inclusive total message length. - * Link and bearer congestion status have been checked to be ok, - * and are ignored if they change. - * - * Note that fragments do not use the full link MTU so that they won't have - * to undergo refragmentation if link changeover causes them to be sent - * over another link with an additional tunnel header added as prefix. - * (Refragmentation will still occur if the other link has a smaller MTU.) - * - * Returns user data length or errno. 
- */ -static int tipc_link_iovec_long_xmit(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destaddr) -{ - struct tipc_link *l_ptr; - struct tipc_node *node; - struct tipc_msg *hdr = &sender->phdr; - u32 dsz = len; - u32 max_pkt, fragm_sz, rest; - struct tipc_msg fragm_hdr; - struct sk_buff *buf, *buf_chain, *prev; - u32 fragm_crs, fragm_rest, hsz, sect_rest; - const unchar __user *sect_crs; - int curr_sect; - u32 fragm_no; - int res = 0; - -again: - fragm_no = 1; - max_pkt = sender->max_pkt - INT_H_SIZE; - /* leave room for tunnel header in case of link changeover */ - fragm_sz = max_pkt - INT_H_SIZE; - /* leave room for fragmentation header in each fragment */ - rest = dsz; - fragm_crs = 0; - fragm_rest = 0; - sect_rest = 0; - sect_crs = NULL; - curr_sect = -1; - - /* Prepare reusable fragment header */ - tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, - INT_H_SIZE, msg_destnode(hdr)); - msg_set_size(&fragm_hdr, max_pkt); - msg_set_fragm_no(&fragm_hdr, 1); - - /* Prepare header of first fragment */ - buf_chain = buf = tipc_buf_acquire(max_pkt); - if (!buf) - return -ENOMEM; - buf->next = NULL; - skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); - hsz = msg_hdr_sz(hdr); - skb_copy_to_linear_data_offset(buf, INT_H_SIZE, hdr, hsz); - - /* Chop up message */ - fragm_crs = INT_H_SIZE + hsz; - fragm_rest = fragm_sz - hsz; - - do { /* For all sections */ - u32 sz; - - if (!sect_rest) { - sect_rest = msg_sect[++curr_sect].iov_len; - sect_crs = msg_sect[curr_sect].iov_base; - } - - if (sect_rest < fragm_rest) - sz = sect_rest; - else - sz = fragm_rest; - - if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) { - res = -EFAULT; -error: - kfree_skb_list(buf_chain); - return res; - } - sect_crs += sz; - sect_rest -= sz; - fragm_crs += sz; - fragm_rest -= sz; - rest -= sz; - - if (!fragm_rest && rest) { - - /* Initiate new fragment: */ - if (rest <= fragm_sz) { - fragm_sz = rest; - msg_set_type(&fragm_hdr, LAST_FRAGMENT); - } 
else { - msg_set_type(&fragm_hdr, FRAGMENT); - } - msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); - msg_set_fragm_no(&fragm_hdr, ++fragm_no); - prev = buf; - buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE); - if (!buf) { - res = -ENOMEM; - goto error; - } - - buf->next = NULL; - prev->next = buf; - skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE); - fragm_crs = INT_H_SIZE; - fragm_rest = fragm_sz; - } - } while (rest > 0); - - /* - * Now we have a buffer chain. Select a link and check - * that packet size is still OK - */ - node = tipc_node_find(destaddr); - if (likely(node)) { - tipc_node_lock(node); - l_ptr = node->active_links[sender->ref & 1]; - if (!l_ptr) { - tipc_node_unlock(node); - goto reject; - } - if (l_ptr->max_pkt < max_pkt) { - sender->max_pkt = l_ptr->max_pkt; - tipc_node_unlock(node); - kfree_skb_list(buf_chain); - goto again; - } - } else { -reject: - kfree_skb_list(buf_chain); - tipc_port_iovec_reject(sender, hdr, msg_sect, len, - TIPC_ERR_NO_NODE); - return -ENETUNREACH; - } - - /* Append chain of fragments to send queue & send them */ - l_ptr->long_msg_seq_no++; - link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); - l_ptr->stats.sent_fragments += fragm_no; - l_ptr->stats.sent_fragmented++; - tipc_link_push_queue(l_ptr); - tipc_node_unlock(node); - return dsz; -} - -/* * tipc_link_push_packet: Push one unsent packet to the media */ static u32 tipc_link_push_packet(struct tipc_link *l_ptr) @@ -1238,7 +902,7 @@ static u32 tipc_link_push_packet(struct tipc_link *l_ptr) tipc_bearer_send(l_ptr->bearer_id, buf, &l_ptr->media_addr); if (msg_user(msg) == MSG_BUNDLER) - msg_set_type(msg, CLOSED_MSG); + msg_set_type(msg, BUNDLE_CLOSED); l_ptr->next_out = buf->next; return 0; } @@ -1524,12 +1188,9 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(l_ptr->next_out)) tipc_link_push_queue(l_ptr); - if (unlikely(!list_empty(&l_ptr->waiting_ports))) - tipc_link_wakeup_ports(l_ptr, 0); - - if 
(unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { - l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); + if (released && !skb_queue_empty(&l_ptr->waiting_sks)) { + link_prepare_wakeup(l_ptr); + l_ptr->owner->action_flags |= TIPC_WAKEUP_USERS; } /* Process the incoming packet */ @@ -1565,57 +1226,19 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr) if (unlikely(l_ptr->oldest_deferred_in)) head = link_insert_deferred_queue(l_ptr, head); - /* Deliver packet/message to correct user: */ - if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) { - if (!tipc_link_tunnel_rcv(n_ptr, &buf)) { - tipc_node_unlock(n_ptr); - continue; - } - msg = buf_msg(buf); - } else if (msg_user(msg) == MSG_FRAGMENTER) { - l_ptr->stats.recv_fragments++; - if (tipc_buf_append(&l_ptr->reasm_buf, &buf)) { - l_ptr->stats.recv_fragmented++; - msg = buf_msg(buf); - } else { - if (!l_ptr->reasm_buf) - tipc_link_reset(l_ptr); - tipc_node_unlock(n_ptr); - continue; - } + if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) { + l_ptr->stats.sent_acks++; + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0, 0); } - switch (msg_user(msg)) { - case TIPC_LOW_IMPORTANCE: - case TIPC_MEDIUM_IMPORTANCE: - case TIPC_HIGH_IMPORTANCE: - case TIPC_CRITICAL_IMPORTANCE: - tipc_node_unlock(n_ptr); - tipc_sk_rcv(buf); - continue; - case MSG_BUNDLER: - l_ptr->stats.recv_bundles++; - l_ptr->stats.recv_bundled += msg_msgcnt(msg); - tipc_node_unlock(n_ptr); - tipc_link_bundle_rcv(buf); - continue; - case NAME_DISTRIBUTOR: - n_ptr->bclink.recv_permitted = true; - tipc_node_unlock(n_ptr); - tipc_named_rcv(buf); - continue; - case CONN_MANAGER: + if (tipc_link_prepare_input(l_ptr, &buf)) { tipc_node_unlock(n_ptr); - tipc_port_proto_rcv(buf); continue; - case BCAST_PROTOCOL: - tipc_link_sync_rcv(n_ptr, buf); - break; - default: - kfree_skb(buf); - break; } tipc_node_unlock(n_ptr); + msg = buf_msg(buf); + if (tipc_link_input(l_ptr, buf) != 0) + goto discard; 
continue; unlock_discard: tipc_node_unlock(n_ptr); @@ -1625,6 +1248,80 @@ discard: } /** + * tipc_link_prepare_input - process TIPC link messages + * + * returns nonzero if the message was consumed + * + * Node lock must be held + */ +static int tipc_link_prepare_input(struct tipc_link *l, struct sk_buff **buf) +{ + struct tipc_node *n; + struct tipc_msg *msg; + int res = -EINVAL; + + n = l->owner; + msg = buf_msg(*buf); + switch (msg_user(msg)) { + case CHANGEOVER_PROTOCOL: + if (tipc_link_tunnel_rcv(n, buf)) + res = 0; + break; + case MSG_FRAGMENTER: + l->stats.recv_fragments++; + if (tipc_buf_append(&l->reasm_buf, buf)) { + l->stats.recv_fragmented++; + res = 0; + } else if (!l->reasm_buf) { + tipc_link_reset(l); + } + break; + case MSG_BUNDLER: + l->stats.recv_bundles++; + l->stats.recv_bundled += msg_msgcnt(msg); + res = 0; + break; + case NAME_DISTRIBUTOR: + n->bclink.recv_permitted = true; + res = 0; + break; + case BCAST_PROTOCOL: + tipc_link_sync_rcv(n, *buf); + break; + default: + res = 0; + } + return res; +} +/** + * tipc_link_input - Deliver message too higher layers + */ +static int tipc_link_input(struct tipc_link *l, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + int res = 0; + + switch (msg_user(msg)) { + case TIPC_LOW_IMPORTANCE: + case TIPC_MEDIUM_IMPORTANCE: + case TIPC_HIGH_IMPORTANCE: + case TIPC_CRITICAL_IMPORTANCE: + case CONN_MANAGER: + tipc_sk_rcv(buf); + break; + case NAME_DISTRIBUTOR: + tipc_named_rcv(buf); + break; + case MSG_BUNDLER: + tipc_link_bundle_rcv(buf); + break; + default: + res = -EINVAL; + } + return res; +} + +/** * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue * * Returns increase in queue length (i.e. 
0 or 1) @@ -2217,6 +1914,7 @@ void tipc_link_bundle_rcv(struct sk_buff *buf) u32 msgcount = msg_msgcnt(buf_msg(buf)); u32 pos = INT_H_SIZE; struct sk_buff *obuf; + struct tipc_msg *omsg; while (msgcount--) { obuf = buf_extract(buf, pos); @@ -2224,82 +1922,18 @@ void tipc_link_bundle_rcv(struct sk_buff *buf) pr_warn("Link unable to unbundle message(s)\n"); break; } - pos += align(msg_size(buf_msg(obuf))); - tipc_net_route_msg(obuf); - } - kfree_skb(buf); -} - -/* - * Fragmentation/defragmentation: - */ - -/* - * tipc_link_frag_xmit: Entry for buffers needing fragmentation. - * The buffer is complete, inclusive total message length. - * Returns user data length. - */ -static int tipc_link_frag_xmit(struct tipc_link *l_ptr, struct sk_buff *buf) -{ - struct sk_buff *buf_chain = NULL; - struct sk_buff *buf_chain_tail = (struct sk_buff *)&buf_chain; - struct tipc_msg *inmsg = buf_msg(buf); - struct tipc_msg fragm_hdr; - u32 insize = msg_size(inmsg); - u32 dsz = msg_data_sz(inmsg); - unchar *crs = buf->data; - u32 rest = insize; - u32 pack_sz = l_ptr->max_pkt; - u32 fragm_sz = pack_sz - INT_H_SIZE; - u32 fragm_no = 0; - u32 destaddr; - - if (msg_short(inmsg)) - destaddr = l_ptr->addr; - else - destaddr = msg_destnode(inmsg); - - /* Prepare reusable fragment header: */ - tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT, - INT_H_SIZE, destaddr); - - /* Chop up message: */ - while (rest > 0) { - struct sk_buff *fragm; - - if (rest <= fragm_sz) { - fragm_sz = rest; - msg_set_type(&fragm_hdr, LAST_FRAGMENT); - } - fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE); - if (fragm == NULL) { - kfree_skb(buf); - kfree_skb_list(buf_chain); - return -ENOMEM; + omsg = buf_msg(obuf); + pos += align(msg_size(omsg)); + if (msg_isdata(omsg) || (msg_user(omsg) == CONN_MANAGER)) { + tipc_sk_rcv(obuf); + } else if (msg_user(omsg) == NAME_DISTRIBUTOR) { + tipc_named_rcv(obuf); + } else { + pr_warn("Illegal bundled msg: %u\n", msg_user(omsg)); + kfree_skb(obuf); } - 
msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); - fragm_no++; - msg_set_fragm_no(&fragm_hdr, fragm_no); - skb_copy_to_linear_data(fragm, &fragm_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(fragm, INT_H_SIZE, crs, - fragm_sz); - buf_chain_tail->next = fragm; - buf_chain_tail = fragm; - - rest -= fragm_sz; - crs += fragm_sz; - msg_set_type(&fragm_hdr, FRAGMENT); } kfree_skb(buf); - - /* Append chain of fragments to send queue & send them */ - l_ptr->long_msg_seq_no++; - link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no); - l_ptr->stats.sent_fragments += fragm_no; - l_ptr->stats.sent_fragmented++; - tipc_link_push_queue(l_ptr); - - return dsz; } static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance) diff --git a/net/tipc/link.h b/net/tipc/link.h index 200d518b218e..b567a3427fda 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -1,7 +1,7 @@ /* * net/tipc/link.h: Include file for TIPC link code * - * Copyright (c) 1995-2006, 2013, Ericsson AB + * Copyright (c) 1995-2006, 2013-2014, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. 
* @@ -133,7 +133,7 @@ struct tipc_stats { * @retransm_queue_size: number of messages to retransmit * @retransm_queue_head: sequence number of first message to retransmit * @next_out: ptr to first unsent outbound message in queue - * @waiting_ports: linked list of ports waiting for link congestion to abate + * @waiting_sks: linked list of sockets waiting for link congestion to abate * @long_msg_seq_no: next identifier to use for outbound fragmented messages * @reasm_buf: head of partially reassembled inbound message fragments * @stats: collects statistics regarding link activity @@ -194,7 +194,7 @@ struct tipc_link { u32 retransm_queue_size; u32 retransm_queue_head; struct sk_buff *next_out; - struct list_head waiting_ports; + struct sk_buff_head waiting_sks; /* Fragmentation/reassembly */ u32 long_msg_seq_no; @@ -227,20 +227,14 @@ void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); void tipc_link_reset_list(unsigned int bearer_id); int tipc_link_xmit(struct sk_buff *buf, u32 dest, u32 selector); -void tipc_link_names_xmit(struct list_head *message_list, u32 dest); -int __tipc_link_xmit(struct tipc_link *l_ptr, struct sk_buff *buf); -int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf); +int __tipc_link_xmit(struct tipc_link *link, struct sk_buff *buf); u32 tipc_link_get_max_pkt(u32 dest, u32 selector); -int tipc_link_iovec_xmit_fast(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len, u32 destnode); void tipc_link_bundle_rcv(struct sk_buff *buf); void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, u32 gap, u32 tolerance, u32 priority, u32 acked_mtu); void tipc_link_push_queue(struct tipc_link *l_ptr); u32 tipc_link_defer_pkt(struct sk_buff **head, struct sk_buff **tail, struct sk_buff *buf); -void tipc_link_wakeup_ports(struct tipc_link *l_ptr, int all); void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); void tipc_link_retransmit(struct 
tipc_link *l_ptr, struct sk_buff *start, u32 retransmits); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 0a37a472c29f..74745a47d72a 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -36,21 +36,16 @@ #include "core.h" #include "msg.h" +#include "addr.h" +#include "name_table.h" -u32 tipc_msg_tot_importance(struct tipc_msg *m) +#define MAX_FORWARD_SIZE 1024 + +static unsigned int align(unsigned int i) { - if (likely(msg_isdata(m))) { - if (likely(msg_orignode(m) == tipc_own_addr)) - return msg_importance(m); - return msg_importance(m) + 4; - } - if ((msg_user(m) == MSG_FRAGMENTER) && - (msg_type(m) == FIRST_FRAGMENT)) - return msg_importance(msg_get_wrapped(m)); - return msg_importance(m); + return (i + 3) & ~3u; } - void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode) { @@ -61,43 +56,35 @@ void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, msg_set_size(m, hsize); msg_set_prevnode(m, tipc_own_addr); msg_set_type(m, type); - msg_set_orignode(m, tipc_own_addr); - msg_set_destnode(m, destnode); + if (hsize > SHORT_H_SIZE) { + msg_set_orignode(m, tipc_own_addr); + msg_set_destnode(m, destnode); + } } -/** - * tipc_msg_build - create message using specified header and data - * - * Note: Caller must not hold any locks in case copy_from_user() is interrupted! 
- * - * Returns message data size or errno - */ -int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, - unsigned int len, int max_size, struct sk_buff **buf) +struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, + uint data_sz, u32 dnode, u32 onode, + u32 dport, u32 oport, int errcode) { - int dsz, sz, hsz; - unsigned char *to; - - dsz = len; - hsz = msg_hdr_sz(hdr); - sz = hsz + dsz; - msg_set_size(hdr, sz); - if (unlikely(sz > max_size)) { - *buf = NULL; - return dsz; - } + struct tipc_msg *msg; + struct sk_buff *buf; - *buf = tipc_buf_acquire(sz); - if (!(*buf)) - return -ENOMEM; - skb_copy_to_linear_data(*buf, hdr, hsz); - to = (*buf)->data + hsz; - if (len && memcpy_fromiovecend(to, msg_sect, 0, dsz)) { - kfree_skb(*buf); - *buf = NULL; - return -EFAULT; + buf = tipc_buf_acquire(hdr_sz + data_sz); + if (unlikely(!buf)) + return NULL; + + msg = buf_msg(buf); + tipc_msg_init(msg, user, type, hdr_sz, dnode); + msg_set_size(msg, hdr_sz + data_sz); + msg_set_prevnode(msg, onode); + msg_set_origport(msg, oport); + msg_set_destport(msg, dport); + msg_set_errcode(msg, errcode); + if (hdr_sz > SHORT_H_SIZE) { + msg_set_orignode(msg, onode); + msg_set_destnode(msg, dnode); } - return dsz; + return buf; } /* tipc_buf_append(): Append a buffer to the fragment list of another buffer @@ -112,27 +99,38 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) struct sk_buff *head = *headbuf; struct sk_buff *frag = *buf; struct sk_buff *tail; - struct tipc_msg *msg = buf_msg(frag); - u32 fragid = msg_type(msg); - bool headstolen; + struct tipc_msg *msg; + u32 fragid; int delta; + bool headstolen; + + if (!frag) + goto err; + msg = buf_msg(frag); + fragid = msg_type(msg); + frag->next = NULL; skb_pull(frag, msg_hdr_sz(msg)); if (fragid == FIRST_FRAGMENT) { - if (head || skb_unclone(frag, GFP_ATOMIC)) - goto out_free; + if (unlikely(head)) + goto err; + if (unlikely(skb_unclone(frag, GFP_ATOMIC))) + goto err; head = *headbuf = frag; 
skb_frag_list_init(head); + TIPC_SKB_CB(head)->tail = NULL; *buf = NULL; return 0; } + if (!head) - goto out_free; - tail = TIPC_SKB_CB(head)->tail; + goto err; + if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { + tail = TIPC_SKB_CB(head)->tail; if (!skb_has_frag_list(head)) skb_shinfo(head)->frag_list = frag; else @@ -142,6 +140,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) head->len += frag->len; TIPC_SKB_CB(head)->tail = frag; } + if (fragid == LAST_FRAGMENT) { *buf = head; TIPC_SKB_CB(head)->tail = NULL; @@ -150,10 +149,314 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) } *buf = NULL; return 0; -out_free: + +err: pr_warn_ratelimited("Unable to build fragment list\n"); kfree_skb(*buf); kfree_skb(*headbuf); *buf = *headbuf = NULL; return 0; } + + +/** + * tipc_msg_build - create buffer chain containing specified header and data + * @mhdr: Message header, to be prepended to data + * @iov: User data + * @offset: Posision in iov to start copying from + * @dsz: Total length of user data + * @pktmax: Max packet size that can be used + * @chain: Buffer or chain of buffers to be returned to caller + * Returns message data size or errno: -ENOMEM, -EFAULT + */ +int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, + int offset, int dsz, int pktmax , struct sk_buff **chain) +{ + int mhsz = msg_hdr_sz(mhdr); + int msz = mhsz + dsz; + int pktno = 1; + int pktsz; + int pktrem = pktmax; + int drem = dsz; + struct tipc_msg pkthdr; + struct sk_buff *buf, *prev; + char *pktpos; + int rc; + uint chain_sz = 0; + msg_set_size(mhdr, msz); + + /* No fragmentation needed? 
*/ + if (likely(msz <= pktmax)) { + buf = tipc_buf_acquire(msz); + *chain = buf; + if (unlikely(!buf)) + return -ENOMEM; + skb_copy_to_linear_data(buf, mhdr, mhsz); + pktpos = buf->data + mhsz; + TIPC_SKB_CB(buf)->chain_sz = 1; + if (!dsz || !memcpy_fromiovecend(pktpos, iov, offset, dsz)) + return dsz; + rc = -EFAULT; + goto error; + } + + /* Prepare reusable fragment header */ + tipc_msg_init(&pkthdr, MSG_FRAGMENTER, FIRST_FRAGMENT, + INT_H_SIZE, msg_destnode(mhdr)); + msg_set_size(&pkthdr, pktmax); + msg_set_fragm_no(&pkthdr, pktno); + + /* Prepare first fragment */ + *chain = buf = tipc_buf_acquire(pktmax); + if (!buf) + return -ENOMEM; + chain_sz = 1; + pktpos = buf->data; + skb_copy_to_linear_data(buf, &pkthdr, INT_H_SIZE); + pktpos += INT_H_SIZE; + pktrem -= INT_H_SIZE; + skb_copy_to_linear_data_offset(buf, INT_H_SIZE, mhdr, mhsz); + pktpos += mhsz; + pktrem -= mhsz; + + do { + if (drem < pktrem) + pktrem = drem; + + if (memcpy_fromiovecend(pktpos, iov, offset, pktrem)) { + rc = -EFAULT; + goto error; + } + drem -= pktrem; + offset += pktrem; + + if (!drem) + break; + + /* Prepare new fragment: */ + if (drem < (pktmax - INT_H_SIZE)) + pktsz = drem + INT_H_SIZE; + else + pktsz = pktmax; + prev = buf; + buf = tipc_buf_acquire(pktsz); + if (!buf) { + rc = -ENOMEM; + goto error; + } + chain_sz++; + prev->next = buf; + msg_set_type(&pkthdr, FRAGMENT); + msg_set_size(&pkthdr, pktsz); + msg_set_fragm_no(&pkthdr, ++pktno); + skb_copy_to_linear_data(buf, &pkthdr, INT_H_SIZE); + pktpos = buf->data + INT_H_SIZE; + pktrem = pktsz - INT_H_SIZE; + + } while (1); + TIPC_SKB_CB(*chain)->chain_sz = chain_sz; + msg_set_type(buf_msg(buf), LAST_FRAGMENT); + return dsz; +error: + kfree_skb_list(*chain); + *chain = NULL; + return rc; +} + +/** + * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one + * @bbuf: the existing buffer ("bundle") + * @buf: buffer to be appended + * @mtu: max allowable size for the bundle buffer + * Consumes buffer if successful + * 
Returns true if bundling could be performed, otherwise false + */ +bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu) +{ + struct tipc_msg *bmsg = buf_msg(bbuf); + struct tipc_msg *msg = buf_msg(buf); + unsigned int bsz = msg_size(bmsg); + unsigned int msz = msg_size(msg); + u32 start = align(bsz); + u32 max = mtu - INT_H_SIZE; + u32 pad = start - bsz; + + if (likely(msg_user(msg) == MSG_FRAGMENTER)) + return false; + if (unlikely(msg_user(msg) == CHANGEOVER_PROTOCOL)) + return false; + if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) + return false; + if (likely(msg_user(bmsg) != MSG_BUNDLER)) + return false; + if (likely(msg_type(bmsg) != BUNDLE_OPEN)) + return false; + if (unlikely(skb_tailroom(bbuf) < (pad + msz))) + return false; + if (unlikely(max < (start + msz))) + return false; + + skb_put(bbuf, pad + msz); + skb_copy_to_linear_data_offset(bbuf, start, buf->data, msz); + msg_set_size(bmsg, start + msz); + msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); + bbuf->next = buf->next; + kfree_skb(buf); + return true; +} + +/** + * tipc_msg_make_bundle(): Create bundle buf and append message to its tail + * @buf: buffer to be appended and replaced + * @mtu: max allowable size for the bundle buffer, inclusive header + * @dnode: destination node for message. 
(Not always present in header) + * Replaces buffer if successful + * Returns true if sucess, otherwise false + */ +bool tipc_msg_make_bundle(struct sk_buff **buf, u32 mtu, u32 dnode) +{ + struct sk_buff *bbuf; + struct tipc_msg *bmsg; + struct tipc_msg *msg = buf_msg(*buf); + u32 msz = msg_size(msg); + u32 max = mtu - INT_H_SIZE; + + if (msg_user(msg) == MSG_FRAGMENTER) + return false; + if (msg_user(msg) == CHANGEOVER_PROTOCOL) + return false; + if (msg_user(msg) == BCAST_PROTOCOL) + return false; + if (msz > (max / 2)) + return false; + + bbuf = tipc_buf_acquire(max); + if (!bbuf) + return false; + + skb_trim(bbuf, INT_H_SIZE); + bmsg = buf_msg(bbuf); + tipc_msg_init(bmsg, MSG_BUNDLER, BUNDLE_OPEN, INT_H_SIZE, dnode); + msg_set_seqno(bmsg, msg_seqno(msg)); + msg_set_ack(bmsg, msg_ack(msg)); + msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); + bbuf->next = (*buf)->next; + tipc_msg_bundle(bbuf, *buf, mtu); + *buf = bbuf; + return true; +} + +/** + * tipc_msg_reverse(): swap source and destination addresses and add error code + * @buf: buffer containing message to be reversed + * @dnode: return value: node where to send message after reversal + * @err: error code to be set in message + * Consumes buffer if failure + * Returns true if success, otherwise false + */ +bool tipc_msg_reverse(struct sk_buff *buf, u32 *dnode, int err) +{ + struct tipc_msg *msg = buf_msg(buf); + uint imp = msg_importance(msg); + struct tipc_msg ohdr; + uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); + + if (skb_linearize(buf)) + goto exit; + if (msg_dest_droppable(msg)) + goto exit; + if (msg_errcode(msg)) + goto exit; + + memcpy(&ohdr, msg, msg_hdr_sz(msg)); + imp = min_t(uint, imp + 1, TIPC_CRITICAL_IMPORTANCE); + if (msg_isdata(msg)) + msg_set_importance(msg, imp); + msg_set_errcode(msg, err); + msg_set_origport(msg, msg_destport(&ohdr)); + msg_set_destport(msg, msg_origport(&ohdr)); + msg_set_prevnode(msg, tipc_own_addr); + if (!msg_short(msg)) { + msg_set_orignode(msg, 
msg_destnode(&ohdr)); + msg_set_destnode(msg, msg_orignode(&ohdr)); + } + msg_set_size(msg, msg_hdr_sz(msg) + rdsz); + skb_trim(buf, msg_size(msg)); + skb_orphan(buf); + *dnode = msg_orignode(&ohdr); + return true; +exit: + kfree_skb(buf); + return false; +} + +/** + * tipc_msg_eval: determine fate of message that found no destination + * @buf: the buffer containing the message. + * @dnode: return value: next-hop node, if message to be forwarded + * @err: error code to use, if message to be rejected + * + * Does not consume buffer + * Returns 0 (TIPC_OK) if message ok and we can try again, -TIPC error + * code if message to be rejected + */ +int tipc_msg_eval(struct sk_buff *buf, u32 *dnode) +{ + struct tipc_msg *msg = buf_msg(buf); + u32 dport; + + if (msg_type(msg) != TIPC_NAMED_MSG) + return -TIPC_ERR_NO_PORT; + if (skb_linearize(buf)) + return -TIPC_ERR_NO_NAME; + if (msg_data_sz(msg) > MAX_FORWARD_SIZE) + return -TIPC_ERR_NO_NAME; + if (msg_reroute_cnt(msg) > 0) + return -TIPC_ERR_NO_NAME; + + *dnode = addr_domain(msg_lookup_scope(msg)); + dport = tipc_nametbl_translate(msg_nametype(msg), + msg_nameinst(msg), + dnode); + if (!dport) + return -TIPC_ERR_NO_NAME; + msg_incr_reroute_cnt(msg); + msg_set_destnode(msg, *dnode); + msg_set_destport(msg, dport); + return TIPC_OK; +} + +/* tipc_msg_reassemble() - clone a buffer chain of fragments and + * reassemble the clones into one message + */ +struct sk_buff *tipc_msg_reassemble(struct sk_buff *chain) +{ + struct sk_buff *buf = chain; + struct sk_buff *frag = buf; + struct sk_buff *head = NULL; + int hdr_sz; + + /* Copy header if single buffer */ + if (!buf->next) { + hdr_sz = skb_headroom(buf) + msg_hdr_sz(buf_msg(buf)); + return __pskb_copy(buf, hdr_sz, GFP_ATOMIC); + } + + /* Clone all fragments and reassemble */ + while (buf) { + frag = skb_clone(buf, GFP_ATOMIC); + if (!frag) + goto error; + frag->next = NULL; + if (tipc_buf_append(&head, &frag)) + break; + if (!head) + goto error; + buf = buf->next; + } + 
return frag; +error: + pr_warn("Failed do clone local mcast rcv buffer\n"); + kfree_skb(head); + return NULL; +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 503511903d1d..0ea7b695ac4d 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -442,6 +442,7 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 +#define SOCK_WAKEUP 14 /* pseudo user */ /* * Connection management protocol message types @@ -463,6 +464,11 @@ static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) #define FRAGMENT 1 #define LAST_FRAGMENT 2 +/* Bundling protocol message types + */ +#define BUNDLE_OPEN 0 +#define BUNDLE_CLOSED 1 + /* * Link management protocol message types */ @@ -706,12 +712,40 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -u32 tipc_msg_tot_importance(struct tipc_msg *m); +static inline u32 tipc_msg_tot_importance(struct tipc_msg *m) +{ + if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) + return msg_importance(msg_get_wrapped(m)); + return msg_importance(m); +} + +static inline u32 msg_tot_origport(struct tipc_msg *m) +{ + if ((msg_user(m) == MSG_FRAGMENTER) && (msg_type(m) == FIRST_FRAGMENT)) + return msg_origport(msg_get_wrapped(m)); + return msg_origport(m); +} + +bool tipc_msg_reverse(struct sk_buff *buf, u32 *dnode, int err); + +int tipc_msg_eval(struct sk_buff *buf, u32 *dnode); + void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); -int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect, - unsigned int len, int max_size, struct sk_buff **buf); + +struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, + uint data_sz, u32 dnode, u32 onode, + u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); +bool tipc_msg_bundle(struct sk_buff *bbuf, struct sk_buff *buf, u32 mtu); + 
+bool tipc_msg_make_bundle(struct sk_buff **buf, u32 mtu, u32 dnode); + +int tipc_msg_build(struct tipc_msg *mhdr, struct iovec const *iov, + int offset, int dsz, int mtu , struct sk_buff **chain); + +struct sk_buff *tipc_msg_reassemble(struct sk_buff *chain); + #endif diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 8ce730984aa1..376d2bb51d8d 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -1,7 +1,7 @@ /* * net/tipc/name_distr.c: TIPC name distribution code * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -71,6 +71,21 @@ static struct publ_list *publ_lists[] = { }; +int sysctl_tipc_named_timeout __read_mostly = 2000; + +/** + * struct tipc_dist_queue - queue holding deferred name table updates + */ +static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue); + +struct distr_queue_item { + struct distr_item i; + u32 dtype; + u32 node; + unsigned long expires; + struct list_head next; +}; + /** * publ_to_item - add publication info to a publication message */ @@ -101,24 +116,22 @@ static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) void named_cluster_distribute(struct sk_buff *buf) { - struct sk_buff *buf_copy; - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; + struct sk_buff *obuf; + struct tipc_node *node; + u32 dnode; rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tipc_node_list, list) { - tipc_node_lock(n_ptr); - l_ptr = n_ptr->active_links[n_ptr->addr & 1]; - if (l_ptr) { - buf_copy = skb_copy(buf, GFP_ATOMIC); - if (!buf_copy) { - tipc_node_unlock(n_ptr); - break; - } - msg_set_destnode(buf_msg(buf_copy), n_ptr->addr); - __tipc_link_xmit(l_ptr, buf_copy); - } - tipc_node_unlock(n_ptr); + list_for_each_entry_rcu(node, &tipc_node_list, list) { + dnode = node->addr; + if (in_own_node(dnode)) + continue; + if (!tipc_node_active_links(node)) + continue; + obuf = 
skb_copy(buf, GFP_ATOMIC); + if (!obuf) + break; + msg_set_destnode(buf_msg(obuf), dnode); + tipc_link_xmit(obuf, dnode, dnode); } rcu_read_unlock(); @@ -175,34 +188,44 @@ struct sk_buff *tipc_named_withdraw(struct publication *publ) return buf; } -/* +/** * named_distribute - prepare name info for bulk distribution to another node + * @msg_list: list of messages (buffers) to be returned from this function + * @dnode: node to be updated + * @pls: linked list of publication items to be packed into buffer chain */ -static void named_distribute(struct list_head *message_list, u32 node, - struct publ_list *pls, u32 max_item_buf) +static void named_distribute(struct list_head *msg_list, u32 dnode, + struct publ_list *pls) { struct publication *publ; struct sk_buff *buf = NULL; struct distr_item *item = NULL; - u32 left = 0; - u32 rest = pls->size * ITEM_SIZE; + uint dsz = pls->size * ITEM_SIZE; + uint msg_dsz = (tipc_node_get_mtu(dnode, 0) / ITEM_SIZE) * ITEM_SIZE; + uint rem = dsz; + uint msg_rem = 0; list_for_each_entry(publ, &pls->list, local_list) { + /* Prepare next buffer: */ if (!buf) { - left = (rest <= max_item_buf) ? 
rest : max_item_buf; - rest -= left; - buf = named_prepare_buf(PUBLICATION, left, node); + msg_rem = min_t(uint, rem, msg_dsz); + rem -= msg_rem; + buf = named_prepare_buf(PUBLICATION, msg_rem, dnode); if (!buf) { pr_warn("Bulk publication failure\n"); return; } item = (struct distr_item *)msg_data(buf_msg(buf)); } + + /* Pack publication into message: */ publ_to_item(item, publ); item++; - left -= ITEM_SIZE; - if (!left) { - list_add_tail((struct list_head *)buf, message_list); + msg_rem -= ITEM_SIZE; + + /* Append full buffer to list: */ + if (!msg_rem) { + list_add_tail((struct list_head *)buf, msg_list); buf = NULL; } } @@ -211,16 +234,20 @@ static void named_distribute(struct list_head *message_list, u32 node, /** * tipc_named_node_up - tell specified node about all publications by this node */ -void tipc_named_node_up(u32 max_item_buf, u32 node) +void tipc_named_node_up(u32 dnode) { - LIST_HEAD(message_list); + LIST_HEAD(msg_list); + struct sk_buff *buf_chain; read_lock_bh(&tipc_nametbl_lock); - named_distribute(&message_list, node, &publ_cluster, max_item_buf); - named_distribute(&message_list, node, &publ_zone, max_item_buf); + named_distribute(&msg_list, dnode, &publ_cluster); + named_distribute(&msg_list, dnode, &publ_zone); read_unlock_bh(&tipc_nametbl_lock); - tipc_link_names_xmit(&message_list, node); + /* Convert circular list to linear list and send: */ + buf_chain = (struct sk_buff *)msg_list.next; + ((struct sk_buff *)msg_list.prev)->next = NULL; + tipc_link_xmit(buf_chain, dnode, dnode); } /** @@ -251,54 +278,105 @@ static void named_purge_publ(struct publication *publ) } /** + * tipc_update_nametbl - try to process a nametable update and notify + * subscribers + * + * tipc_nametbl_lock must be held. + * Returns the publication item if successful, otherwise NULL. 
+ */ +static bool tipc_update_nametbl(struct distr_item *i, u32 node, u32 dtype) +{ + struct publication *publ = NULL; + + if (dtype == PUBLICATION) { + publ = tipc_nametbl_insert_publ(ntohl(i->type), ntohl(i->lower), + ntohl(i->upper), + TIPC_CLUSTER_SCOPE, node, + ntohl(i->ref), ntohl(i->key)); + if (publ) { + tipc_nodesub_subscribe(&publ->subscr, node, publ, + (net_ev_handler) + named_purge_publ); + return true; + } + } else if (dtype == WITHDRAWAL) { + publ = tipc_nametbl_remove_publ(ntohl(i->type), ntohl(i->lower), + node, ntohl(i->ref), + ntohl(i->key)); + if (publ) { + tipc_nodesub_unsubscribe(&publ->subscr); + kfree(publ); + return true; + } + } else { + pr_warn("Unrecognized name table message received\n"); + } + return false; +} + +/** + * tipc_named_add_backlog - add a failed name table update to the backlog + * + */ +static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node) +{ + struct distr_queue_item *e; + unsigned long now = get_jiffies_64(); + + e = kzalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + return; + e->dtype = type; + e->node = node; + e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); + memcpy(e, i, sizeof(*i)); + list_add_tail(&e->next, &tipc_dist_queue); +} + +/** + * tipc_named_process_backlog - try to process any pending name table updates + * from the network. 
+ */ +void tipc_named_process_backlog(void) +{ + struct distr_queue_item *e, *tmp; + char addr[16]; + unsigned long now = get_jiffies_64(); + + list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) { + if (time_after(e->expires, now)) { + if (!tipc_update_nametbl(&e->i, e->node, e->dtype)) + continue; + } else { + tipc_addr_string_fill(addr, e->node); + pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", + e->dtype, ntohl(e->i.type), + ntohl(e->i.lower), + ntohl(e->i.upper), + addr, ntohl(e->i.key)); + } + list_del(&e->next); + kfree(e); + } +} + +/** * tipc_named_rcv - process name table update message sent by another node */ void tipc_named_rcv(struct sk_buff *buf) { - struct publication *publ; struct tipc_msg *msg = buf_msg(buf); struct distr_item *item = (struct distr_item *)msg_data(msg); u32 count = msg_data_sz(msg) / ITEM_SIZE; + u32 node = msg_orignode(msg); write_lock_bh(&tipc_nametbl_lock); while (count--) { - if (msg_type(msg) == PUBLICATION) { - publ = tipc_nametbl_insert_publ(ntohl(item->type), - ntohl(item->lower), - ntohl(item->upper), - TIPC_CLUSTER_SCOPE, - msg_orignode(msg), - ntohl(item->ref), - ntohl(item->key)); - if (publ) { - tipc_nodesub_subscribe(&publ->subscr, - msg_orignode(msg), - publ, - (net_ev_handler) - named_purge_publ); - } - } else if (msg_type(msg) == WITHDRAWAL) { - publ = tipc_nametbl_remove_publ(ntohl(item->type), - ntohl(item->lower), - msg_orignode(msg), - ntohl(item->ref), - ntohl(item->key)); - - if (publ) { - tipc_nodesub_unsubscribe(&publ->subscr); - kfree(publ); - } else { - pr_err("Unable to remove publication by node 0x%x\n" - " (type=%u, lower=%u, ref=%u, key=%u)\n", - msg_orignode(msg), ntohl(item->type), - ntohl(item->lower), ntohl(item->ref), - ntohl(item->key)); - } - } else { - pr_warn("Unrecognized name table message received\n"); - } + if (!tipc_update_nametbl(item, node, msg_type(msg))) + tipc_named_add_backlog(item, msg_type(msg), node); item++; } + 
tipc_named_process_backlog(); write_unlock_bh(&tipc_nametbl_lock); kfree_skb(buf); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index b2eed4ec1526..b9e75feb3434 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -70,8 +70,9 @@ struct distr_item { struct sk_buff *tipc_named_publish(struct publication *publ); struct sk_buff *tipc_named_withdraw(struct publication *publ); void named_cluster_distribute(struct sk_buff *buf); -void tipc_named_node_up(u32 max_item_buf, u32 node); +void tipc_named_node_up(u32 dnode); void tipc_named_rcv(struct sk_buff *buf); void tipc_named_reinit(void); +void tipc_named_process_backlog(void); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 9d7d37d95187..3a6a0a7c0759 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -39,7 +39,6 @@ #include "name_table.h" #include "name_distr.h" #include "subscr.h" -#include "port.h" #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -262,8 +261,6 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, /* Lower end overlaps existing entry => need an exact match */ if ((sseq->lower != lower) || (sseq->upper != upper)) { - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", - type, lower, upper); return NULL; } @@ -285,8 +282,6 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, /* Fail if upper end overlaps into an existing entry */ if ((inspos < nseq->first_free) && (upper >= nseq->sseqs[inspos].lower)) { - pr_warn("Cannot publish {%u,%u,%u}, overlap error\n", - type, lower, upper); return NULL; } @@ -678,6 +673,8 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, if (likely(publ)) { table.local_publ_count++; buf = tipc_named_publish(publ); + /* Any pending external events? 
*/ + tipc_named_process_backlog(); } write_unlock_bh(&tipc_nametbl_lock); @@ -699,6 +696,8 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) if (likely(publ)) { table.local_publ_count--; buf = tipc_named_withdraw(publ); + /* Any pending external events? */ + tipc_named_process_backlog(); write_unlock_bh(&tipc_nametbl_lock); list_del_init(&publ->pport_list); kfree(publ); diff --git a/net/tipc/net.c b/net/tipc/net.c index f64375e7f99f..93b9944a6a8b 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -1,7 +1,7 @@ /* * net/tipc/net.c: TIPC network routing code * - * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -38,7 +38,6 @@ #include "net.h" #include "name_distr.h" #include "subscr.h" -#include "port.h" #include "socket.h" #include "node.h" #include "config.h" @@ -104,67 +103,6 @@ * - A local spin_lock protecting the queue of subscriber events. */ -static void net_route_named_msg(struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - u32 dnode; - u32 dport; - - if (!msg_named(msg)) { - kfree_skb(buf); - return; - } - - dnode = addr_domain(msg_lookup_scope(msg)); - dport = tipc_nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode); - if (dport) { - msg_set_destnode(msg, dnode); - msg_set_destport(msg, dport); - tipc_net_route_msg(buf); - return; - } - tipc_reject_msg(buf, TIPC_ERR_NO_NAME); -} - -void tipc_net_route_msg(struct sk_buff *buf) -{ - struct tipc_msg *msg; - u32 dnode; - - if (!buf) - return; - msg = buf_msg(buf); - - /* Handle message for this node */ - dnode = msg_short(msg) ? 
tipc_own_addr : msg_destnode(msg); - if (tipc_in_scope(dnode, tipc_own_addr)) { - if (msg_isdata(msg)) { - if (msg_mcast(msg)) - tipc_port_mcast_rcv(buf, NULL); - else if (msg_destport(msg)) - tipc_sk_rcv(buf); - else - net_route_named_msg(buf); - return; - } - switch (msg_user(msg)) { - case NAME_DISTRIBUTOR: - tipc_named_rcv(buf); - break; - case CONN_MANAGER: - tipc_port_proto_rcv(buf); - break; - default: - kfree_skb(buf); - } - return; - } - - /* Handle message for another node */ - skb_trim(buf, msg_size(msg)); - tipc_link_xmit(buf, dnode, msg_link_selector(msg)); -} - int tipc_net_start(u32 addr) { char addr_string[16]; @@ -172,7 +110,7 @@ int tipc_net_start(u32 addr) tipc_own_addr = addr; tipc_named_reinit(); - tipc_port_reinit(); + tipc_sk_reinit(); res = tipc_bclink_init(); if (res) return res; diff --git a/net/tipc/net.h b/net/tipc/net.h index c6c2b46f7c28..59ef3388be2c 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -37,8 +37,6 @@ #ifndef _TIPC_NET_H #define _TIPC_NET_H -void tipc_net_route_msg(struct sk_buff *buf); - int tipc_net_start(u32 addr); void tipc_net_stop(void); diff --git a/net/tipc/node.c b/net/tipc/node.c index 5b44c3041be4..90cee4a6fce4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012 Ericsson AB + * Copyright (c) 2000-2006, 2012-2014, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * @@ -38,6 +38,7 @@ #include "config.h" #include "node.h" #include "name_distr.h" +#include "socket.h" #define NODE_HTABLE_SIZE 512 @@ -50,6 +51,13 @@ static u32 tipc_num_nodes; static u32 tipc_num_links; static DEFINE_SPINLOCK(node_list_lock); +struct tipc_sock_conn { + u32 port; + u32 peer_port; + u32 peer_node; + struct list_head list; +}; + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. 
The number of hash table @@ -100,6 +108,8 @@ struct tipc_node *tipc_node_create(u32 addr) INIT_HLIST_NODE(&n_ptr->hash); INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->nsub); + INIT_LIST_HEAD(&n_ptr->conn_sks); + __skb_queue_head_init(&n_ptr->waiting_sks); hlist_add_head_rcu(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]); @@ -136,6 +146,71 @@ void tipc_node_stop(void) spin_unlock_bh(&node_list_lock); } +int tipc_node_add_conn(u32 dnode, u32 port, u32 peer_port) +{ + struct tipc_node *node; + struct tipc_sock_conn *conn; + + if (in_own_node(dnode)) + return 0; + + node = tipc_node_find(dnode); + if (!node) { + pr_warn("Connecting sock to node 0x%x failed\n", dnode); + return -EHOSTUNREACH; + } + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) + return -EHOSTUNREACH; + conn->peer_node = dnode; + conn->port = port; + conn->peer_port = peer_port; + + tipc_node_lock(node); + list_add_tail(&conn->list, &node->conn_sks); + tipc_node_unlock(node); + return 0; +} + +void tipc_node_remove_conn(u32 dnode, u32 port) +{ + struct tipc_node *node; + struct tipc_sock_conn *conn, *safe; + + if (in_own_node(dnode)) + return; + + node = tipc_node_find(dnode); + if (!node) + return; + + tipc_node_lock(node); + list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { + if (port != conn->port) + continue; + list_del(&conn->list); + kfree(conn); + } + tipc_node_unlock(node); +} + +void tipc_node_abort_sock_conns(struct list_head *conns) +{ + struct tipc_sock_conn *conn, *safe; + struct sk_buff *buf; + + list_for_each_entry_safe(conn, safe, conns, list) { + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, tipc_own_addr, + conn->peer_node, conn->port, + conn->peer_port, TIPC_ERR_NO_NODE); + if (likely(buf)) + tipc_sk_rcv(buf); + list_del(&conn->list); + kfree(conn); + } +} + /** * tipc_node_link_up - handle addition of link * @@ -155,21 +230,25 @@ void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) if (!active[0]) { 
active[0] = active[1] = l_ptr; node_established_contact(n_ptr); - return; + goto exit; } if (l_ptr->priority < active[0]->priority) { pr_info("New link <%s> becomes standby\n", l_ptr->name); - return; + goto exit; } tipc_link_dup_queue_xmit(active[0], l_ptr); if (l_ptr->priority == active[0]->priority) { active[0] = l_ptr; - return; + goto exit; } pr_info("Old link <%s> becomes standby\n", active[0]->name); if (active[1] != active[0]) pr_info("Old link <%s> becomes standby\n", active[1]->name); active[0] = active[1] = l_ptr; +exit: + /* Leave room for changeover header when returning 'mtu' to users: */ + n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; } /** @@ -229,6 +308,19 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) tipc_link_failover_send_queue(l_ptr); else node_lost_contact(n_ptr); + + /* Leave room for changeover header when returning 'mtu' to users: */ + if (active[0]) { + n_ptr->act_mtus[0] = active[0]->max_pkt - INT_H_SIZE; + n_ptr->act_mtus[1] = active[1]->max_pkt - INT_H_SIZE; + return; + } + + /* Loopback link went down? No fragmentation needed from now on. 
*/ + if (n_ptr->addr == tipc_own_addr) { + n_ptr->act_mtus[0] = MAX_MSG_SIZE; + n_ptr->act_mtus[1] = MAX_MSG_SIZE; + } } int tipc_node_active_links(struct tipc_node *n_ptr) @@ -457,32 +549,45 @@ int tipc_node_get_linkname(u32 bearer_id, u32 addr, char *linkname, size_t len) void tipc_node_unlock(struct tipc_node *node) { LIST_HEAD(nsub_list); - struct tipc_link *link; - int pkt_sz = 0; + LIST_HEAD(conn_sks); + struct sk_buff_head waiting_sks; u32 addr = 0; + unsigned int flags = node->action_flags; if (likely(!node->action_flags)) { spin_unlock_bh(&node->lock); return; } + __skb_queue_head_init(&waiting_sks); + if (node->action_flags & TIPC_WAKEUP_USERS) { + skb_queue_splice_init(&node->waiting_sks, &waiting_sks); + node->action_flags &= ~TIPC_WAKEUP_USERS; + } if (node->action_flags & TIPC_NOTIFY_NODE_DOWN) { list_replace_init(&node->nsub, &nsub_list); + list_replace_init(&node->conn_sks, &conn_sks); node->action_flags &= ~TIPC_NOTIFY_NODE_DOWN; } if (node->action_flags & TIPC_NOTIFY_NODE_UP) { - link = node->active_links[0]; node->action_flags &= ~TIPC_NOTIFY_NODE_UP; - if (link) { - pkt_sz = ((link->max_pkt - INT_H_SIZE) / ITEM_SIZE) * - ITEM_SIZE; - addr = node->addr; - } + addr = node->addr; } + node->action_flags &= ~TIPC_WAKEUP_BCAST_USERS; spin_unlock_bh(&node->lock); + while (!skb_queue_empty(&waiting_sks)) + tipc_sk_rcv(__skb_dequeue(&waiting_sks)); + + if (!list_empty(&conn_sks)) + tipc_node_abort_sock_conns(&conn_sks); + if (!list_empty(&nsub_list)) tipc_nodesub_notify(&nsub_list); - if (pkt_sz) - tipc_named_node_up(pkt_sz, addr); + + if (flags & TIPC_WAKEUP_BCAST_USERS) + tipc_bclink_wakeup_users(); + + if (addr) + tipc_named_node_up(addr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index 9087063793f2..67513c3c852c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -41,6 +41,7 @@ #include "addr.h" #include "net.h" #include "bearer.h" +#include "msg.h" /* * Out-of-range value for node signature @@ -57,7 +58,9 @@ enum { TIPC_WAIT_PEER_LINKS_DOWN 
= (1 << 1), TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), - TIPC_NOTIFY_NODE_UP = (1 << 4) + TIPC_NOTIFY_NODE_UP = (1 << 4), + TIPC_WAKEUP_USERS = (1 << 5), + TIPC_WAKEUP_BCAST_USERS = (1 << 6) }; /** @@ -105,6 +108,7 @@ struct tipc_node { spinlock_t lock; struct hlist_node hash; struct tipc_link *active_links[2]; + u32 act_mtus[2]; struct tipc_link *links[MAX_BEARERS]; unsigned int action_flags; struct tipc_node_bclink bclink; @@ -113,6 +117,8 @@ struct tipc_node { int working_links; u32 signature; struct list_head nsub; + struct sk_buff_head waiting_sks; + struct list_head conn_sks; struct rcu_head rcu; }; @@ -131,6 +137,8 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space) struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space); int tipc_node_get_linkname(u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); +int tipc_node_add_conn(u32 dnode, u32 port, u32 peer_port); +void tipc_node_remove_conn(u32 dnode, u32 port); static inline void tipc_node_lock(struct tipc_node *node) { @@ -143,4 +151,19 @@ static inline bool tipc_node_blocked(struct tipc_node *node) TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); } +static inline uint tipc_node_get_mtu(u32 addr, u32 selector) +{ + struct tipc_node *node; + u32 mtu; + + node = tipc_node_find(addr); + + if (likely(node)) + mtu = node->act_mtus[selector & 1]; + else + mtu = MAX_MSG_SIZE; + + return mtu; +} + #endif diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c index 7c59ab1d6ecb..2d13eea8574a 100644 --- a/net/tipc/node_subscr.c +++ b/net/tipc/node_subscr.c @@ -84,11 +84,13 @@ void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub) void tipc_nodesub_notify(struct list_head *nsub_list) { struct tipc_node_subscr *ns, *safe; + net_ev_handler handle_node_down; list_for_each_entry_safe(ns, safe, nsub_list, nodesub_list) { - if (ns->handle_node_down) { - 
ns->handle_node_down(ns->usr_handle); + handle_node_down = ns->handle_node_down; + if (handle_node_down) { ns->handle_node_down = NULL; + handle_node_down(ns->usr_handle); } } } diff --git a/net/tipc/port.c b/net/tipc/port.c deleted file mode 100644 index 5fd7acce01ea..000000000000 --- a/net/tipc/port.c +++ /dev/null @@ -1,898 +0,0 @@ -/* - * net/tipc/port.c: TIPC port code - * - * Copyright (c) 1992-2007, 2014, Ericsson AB - * Copyright (c) 2004-2008, 2010-2013, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "core.h" -#include "config.h" -#include "port.h" -#include "name_table.h" -#include "socket.h" - -/* Connection management: */ -#define PROBING_INTERVAL 3600000 /* [ms] => 1 h */ -#define CONFIRMED 0 -#define PROBING 1 - -#define MAX_REJECT_SIZE 1024 - -DEFINE_SPINLOCK(tipc_port_list_lock); - -static LIST_HEAD(ports); -static void port_handle_node_down(unsigned long ref); -static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err); -static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err); -static void port_timeout(unsigned long ref); - -/** - * tipc_port_peer_msg - verify message was sent by connected port's peer - * - * Handles cases where the node's network address has changed from - * the default of <0.0.0> to its configured setting. 
- */ -int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg) -{ - u32 peernode; - u32 orignode; - - if (msg_origport(msg) != tipc_port_peerport(p_ptr)) - return 0; - - orignode = msg_orignode(msg); - peernode = tipc_port_peernode(p_ptr); - return (orignode == peernode) || - (!orignode && (peernode == tipc_own_addr)) || - (!peernode && (orignode == tipc_own_addr)); -} - -/** - * tipc_port_mcast_xmit - send a multicast message to local and remote - * destinations - */ -int tipc_port_mcast_xmit(struct tipc_port *oport, - struct tipc_name_seq const *seq, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *hdr; - struct sk_buff *buf; - struct sk_buff *ibuf = NULL; - struct tipc_port_list dports = {0, NULL, }; - int ext_targets; - int res; - - /* Create multicast message */ - hdr = &oport->phdr; - msg_set_type(hdr, TIPC_MCAST_MSG); - msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); - msg_set_destport(hdr, 0); - msg_set_destnode(hdr, 0); - msg_set_nametype(hdr, seq->type); - msg_set_namelower(hdr, seq->lower); - msg_set_nameupper(hdr, seq->upper); - msg_set_hdr_sz(hdr, MCAST_H_SIZE); - res = tipc_msg_build(hdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (unlikely(!buf)) - return res; - - /* Figure out where to send multicast message */ - ext_targets = tipc_nametbl_mc_translate(seq->type, seq->lower, seq->upper, - TIPC_NODE_SCOPE, &dports); - - /* Send message to destinations (duplicate it only if necessary) */ - if (ext_targets) { - if (dports.count != 0) { - ibuf = skb_copy(buf, GFP_ATOMIC); - if (ibuf == NULL) { - tipc_port_list_free(&dports); - kfree_skb(buf); - return -ENOMEM; - } - } - res = tipc_bclink_xmit(buf); - if ((res < 0) && (dports.count != 0)) - kfree_skb(ibuf); - } else { - ibuf = buf; - } - - if (res >= 0) { - if (ibuf) - tipc_port_mcast_rcv(ibuf, &dports); - } else { - tipc_port_list_free(&dports); - } - return res; -} - -/** - * tipc_port_mcast_rcv - deliver multicast message to all destination ports - * - * If there is 
no port list, perform a lookup to create one - */ -void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list *dp) -{ - struct tipc_msg *msg; - struct tipc_port_list dports = {0, NULL, }; - struct tipc_port_list *item = dp; - int cnt = 0; - - msg = buf_msg(buf); - - /* Create destination port list, if one wasn't supplied */ - if (dp == NULL) { - tipc_nametbl_mc_translate(msg_nametype(msg), - msg_namelower(msg), - msg_nameupper(msg), - TIPC_CLUSTER_SCOPE, - &dports); - item = dp = &dports; - } - - /* Deliver a copy of message to each destination port */ - if (dp->count != 0) { - msg_set_destnode(msg, tipc_own_addr); - if (dp->count == 1) { - msg_set_destport(msg, dp->ports[0]); - tipc_sk_rcv(buf); - tipc_port_list_free(dp); - return; - } - for (; cnt < dp->count; cnt++) { - int index = cnt % PLSIZE; - struct sk_buff *b = skb_clone(buf, GFP_ATOMIC); - - if (b == NULL) { - pr_warn("Unable to deliver multicast message(s)\n"); - goto exit; - } - if ((index == 0) && (cnt != 0)) - item = item->next; - msg_set_destport(buf_msg(b), item->ports[index]); - tipc_sk_rcv(b); - } - } -exit: - kfree_skb(buf); - tipc_port_list_free(dp); -} - - -void tipc_port_wakeup(struct tipc_port *port) -{ - tipc_sock_wakeup(tipc_port_to_sock(port)); -} - -/* tipc_port_init - intiate TIPC port and lock it - * - * Returns obtained reference if initialization is successful, zero otherwise - */ -u32 tipc_port_init(struct tipc_port *p_ptr, - const unsigned int importance) -{ - struct tipc_msg *msg; - u32 ref; - - ref = tipc_ref_acquire(p_ptr, &p_ptr->lock); - if (!ref) { - pr_warn("Port registration failed, ref. 
table exhausted\n"); - return 0; - } - - p_ptr->max_pkt = MAX_PKT_DEFAULT; - p_ptr->ref = ref; - INIT_LIST_HEAD(&p_ptr->wait_list); - INIT_LIST_HEAD(&p_ptr->subscription.nodesub_list); - k_init_timer(&p_ptr->timer, (Handler)port_timeout, ref); - INIT_LIST_HEAD(&p_ptr->publications); - INIT_LIST_HEAD(&p_ptr->port_list); - - /* - * Must hold port list lock while initializing message header template - * to ensure a change to node's own network address doesn't result - * in template containing out-dated network address information - */ - spin_lock_bh(&tipc_port_list_lock); - msg = &p_ptr->phdr; - tipc_msg_init(msg, importance, TIPC_NAMED_MSG, NAMED_H_SIZE, 0); - msg_set_origport(msg, ref); - list_add_tail(&p_ptr->port_list, &ports); - spin_unlock_bh(&tipc_port_list_lock); - return ref; -} - -void tipc_port_destroy(struct tipc_port *p_ptr) -{ - struct sk_buff *buf = NULL; - - tipc_withdraw(p_ptr, 0, NULL); - - spin_lock_bh(p_ptr->lock); - tipc_ref_discard(p_ptr->ref); - spin_unlock_bh(p_ptr->lock); - - k_cancel_timer(&p_ptr->timer); - if (p_ptr->connected) { - buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT); - tipc_nodesub_unsubscribe(&p_ptr->subscription); - } - - spin_lock_bh(&tipc_port_list_lock); - list_del(&p_ptr->port_list); - list_del(&p_ptr->wait_list); - spin_unlock_bh(&tipc_port_list_lock); - k_term_timer(&p_ptr->timer); - tipc_net_route_msg(buf); -} - -/* - * port_build_proto_msg(): create connection protocol message for port - * - * On entry the port must be locked and connected. 
- */ -static struct sk_buff *port_build_proto_msg(struct tipc_port *p_ptr, - u32 type, u32 ack) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - - buf = tipc_buf_acquire(INT_H_SIZE); - if (buf) { - msg = buf_msg(buf); - tipc_msg_init(msg, CONN_MANAGER, type, INT_H_SIZE, - tipc_port_peernode(p_ptr)); - msg_set_destport(msg, tipc_port_peerport(p_ptr)); - msg_set_origport(msg, p_ptr->ref); - msg_set_msgcnt(msg, ack); - } - return buf; -} - -int tipc_reject_msg(struct sk_buff *buf, u32 err) -{ - struct tipc_msg *msg = buf_msg(buf); - struct sk_buff *rbuf; - struct tipc_msg *rmsg; - int hdr_sz; - u32 imp; - u32 data_sz = msg_data_sz(msg); - u32 src_node; - u32 rmsg_sz; - - /* discard rejected message if it shouldn't be returned to sender */ - if (WARN(!msg_isdata(msg), - "attempt to reject message with user=%u", msg_user(msg))) { - dump_stack(); - goto exit; - } - if (msg_errcode(msg) || msg_dest_droppable(msg)) - goto exit; - - /* - * construct returned message by copying rejected message header and - * data (or subset), then updating header fields that need adjusting - */ - hdr_sz = msg_hdr_sz(msg); - rmsg_sz = hdr_sz + min_t(u32, data_sz, MAX_REJECT_SIZE); - - rbuf = tipc_buf_acquire(rmsg_sz); - if (rbuf == NULL) - goto exit; - - rmsg = buf_msg(rbuf); - skb_copy_to_linear_data(rbuf, msg, rmsg_sz); - - if (msg_connected(rmsg)) { - imp = msg_importance(rmsg); - if (imp < TIPC_CRITICAL_IMPORTANCE) - msg_set_importance(rmsg, ++imp); - } - msg_set_non_seq(rmsg, 0); - msg_set_size(rmsg, rmsg_sz); - msg_set_errcode(rmsg, err); - msg_set_prevnode(rmsg, tipc_own_addr); - msg_swap_words(rmsg, 4, 5); - if (!msg_short(rmsg)) - msg_swap_words(rmsg, 6, 7); - - /* send self-abort message when rejecting on a connected port */ - if (msg_connected(msg)) { - struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg)); - - if (p_ptr) { - struct sk_buff *abuf = NULL; - - if (p_ptr->connected) - abuf = port_build_self_abort_msg(p_ptr, err); - tipc_port_unlock(p_ptr); - 
tipc_net_route_msg(abuf); - } - } - - /* send returned message & dispose of rejected message */ - src_node = msg_prevnode(msg); - if (in_own_node(src_node)) - tipc_sk_rcv(rbuf); - else - tipc_link_xmit(rbuf, src_node, msg_link_selector(rmsg)); -exit: - kfree_skb(buf); - return data_sz; -} - -int tipc_port_iovec_reject(struct tipc_port *p_ptr, struct tipc_msg *hdr, - struct iovec const *msg_sect, unsigned int len, - int err) -{ - struct sk_buff *buf; - int res; - - res = tipc_msg_build(hdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (!buf) - return res; - - return tipc_reject_msg(buf, err); -} - -static void port_timeout(unsigned long ref) -{ - struct tipc_port *p_ptr = tipc_port_lock(ref); - struct sk_buff *buf = NULL; - - if (!p_ptr) - return; - - if (!p_ptr->connected) { - tipc_port_unlock(p_ptr); - return; - } - - /* Last probe answered ? */ - if (p_ptr->probing_state == PROBING) { - buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_PORT); - } else { - buf = port_build_proto_msg(p_ptr, CONN_PROBE, 0); - p_ptr->probing_state = PROBING; - k_start_timer(&p_ptr->timer, p_ptr->probing_interval); - } - tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); -} - - -static void port_handle_node_down(unsigned long ref) -{ - struct tipc_port *p_ptr = tipc_port_lock(ref); - struct sk_buff *buf = NULL; - - if (!p_ptr) - return; - buf = port_build_self_abort_msg(p_ptr, TIPC_ERR_NO_NODE); - tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); -} - - -static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err) -{ - struct sk_buff *buf = port_build_peer_abort_msg(p_ptr, err); - - if (buf) { - struct tipc_msg *msg = buf_msg(buf); - msg_swap_words(msg, 4, 5); - msg_swap_words(msg, 6, 7); - } - return buf; -} - - -static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err) -{ - struct sk_buff *buf; - struct tipc_msg *msg; - u32 imp; - - if (!p_ptr->connected) - return NULL; - - buf = tipc_buf_acquire(BASIC_H_SIZE); - if (buf) { - msg = 
buf_msg(buf); - memcpy(msg, &p_ptr->phdr, BASIC_H_SIZE); - msg_set_hdr_sz(msg, BASIC_H_SIZE); - msg_set_size(msg, BASIC_H_SIZE); - imp = msg_importance(msg); - if (imp < TIPC_CRITICAL_IMPORTANCE) - msg_set_importance(msg, ++imp); - msg_set_errcode(msg, err); - } - return buf; -} - -void tipc_port_proto_rcv(struct sk_buff *buf) -{ - struct tipc_msg *msg = buf_msg(buf); - struct tipc_port *p_ptr; - struct sk_buff *r_buf = NULL; - u32 destport = msg_destport(msg); - int wakeable; - - /* Validate connection */ - p_ptr = tipc_port_lock(destport); - if (!p_ptr || !p_ptr->connected || !tipc_port_peer_msg(p_ptr, msg)) { - r_buf = tipc_buf_acquire(BASIC_H_SIZE); - if (r_buf) { - msg = buf_msg(r_buf); - tipc_msg_init(msg, TIPC_HIGH_IMPORTANCE, TIPC_CONN_MSG, - BASIC_H_SIZE, msg_orignode(msg)); - msg_set_errcode(msg, TIPC_ERR_NO_PORT); - msg_set_origport(msg, destport); - msg_set_destport(msg, msg_origport(msg)); - } - if (p_ptr) - tipc_port_unlock(p_ptr); - goto exit; - } - - /* Process protocol message sent by peer */ - switch (msg_type(msg)) { - case CONN_ACK: - wakeable = tipc_port_congested(p_ptr) && p_ptr->congested; - p_ptr->acked += msg_msgcnt(msg); - if (!tipc_port_congested(p_ptr)) { - p_ptr->congested = 0; - if (wakeable) - tipc_port_wakeup(p_ptr); - } - break; - case CONN_PROBE: - r_buf = port_build_proto_msg(p_ptr, CONN_PROBE_REPLY, 0); - break; - default: - /* CONN_PROBE_REPLY or unrecognized - no action required */ - break; - } - p_ptr->probing_state = CONFIRMED; - tipc_port_unlock(p_ptr); -exit: - tipc_net_route_msg(r_buf); - kfree_skb(buf); -} - -static int port_print(struct tipc_port *p_ptr, char *buf, int len, int full_id) -{ - struct publication *publ; - int ret; - - if (full_id) - ret = tipc_snprintf(buf, len, "<%u.%u.%u:%u>:", - tipc_zone(tipc_own_addr), - tipc_cluster(tipc_own_addr), - tipc_node(tipc_own_addr), p_ptr->ref); - else - ret = tipc_snprintf(buf, len, "%-10u:", p_ptr->ref); - - if (p_ptr->connected) { - u32 dport = tipc_port_peerport(p_ptr); 
- u32 destnode = tipc_port_peernode(p_ptr); - - ret += tipc_snprintf(buf + ret, len - ret, - " connected to <%u.%u.%u:%u>", - tipc_zone(destnode), - tipc_cluster(destnode), - tipc_node(destnode), dport); - if (p_ptr->conn_type != 0) - ret += tipc_snprintf(buf + ret, len - ret, - " via {%u,%u}", p_ptr->conn_type, - p_ptr->conn_instance); - } else if (p_ptr->published) { - ret += tipc_snprintf(buf + ret, len - ret, " bound to"); - list_for_each_entry(publ, &p_ptr->publications, pport_list) { - if (publ->lower == publ->upper) - ret += tipc_snprintf(buf + ret, len - ret, - " {%u,%u}", publ->type, - publ->lower); - else - ret += tipc_snprintf(buf + ret, len - ret, - " {%u,%u,%u}", publ->type, - publ->lower, publ->upper); - } - } - ret += tipc_snprintf(buf + ret, len - ret, "\n"); - return ret; -} - -struct sk_buff *tipc_port_get_ports(void) -{ - struct sk_buff *buf; - struct tlv_desc *rep_tlv; - char *pb; - int pb_len; - struct tipc_port *p_ptr; - int str_len = 0; - - buf = tipc_cfg_reply_alloc(TLV_SPACE(ULTRA_STRING_MAX_LEN)); - if (!buf) - return NULL; - rep_tlv = (struct tlv_desc *)buf->data; - pb = TLV_DATA(rep_tlv); - pb_len = ULTRA_STRING_MAX_LEN; - - spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &ports, port_list) { - spin_lock_bh(p_ptr->lock); - str_len += port_print(p_ptr, pb, pb_len, 0); - spin_unlock_bh(p_ptr->lock); - } - spin_unlock_bh(&tipc_port_list_lock); - str_len += 1; /* for "\0" */ - skb_put(buf, TLV_SPACE(str_len)); - TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); - - return buf; -} - -void tipc_port_reinit(void) -{ - struct tipc_port *p_ptr; - struct tipc_msg *msg; - - spin_lock_bh(&tipc_port_list_lock); - list_for_each_entry(p_ptr, &ports, port_list) { - msg = &p_ptr->phdr; - msg_set_prevnode(msg, tipc_own_addr); - msg_set_orignode(msg, tipc_own_addr); - } - spin_unlock_bh(&tipc_port_list_lock); -} - -void tipc_acknowledge(u32 ref, u32 ack) -{ - struct tipc_port *p_ptr; - struct sk_buff *buf = NULL; - - p_ptr = 
tipc_port_lock(ref); - if (!p_ptr) - return; - if (p_ptr->connected) { - p_ptr->conn_unacked -= ack; - buf = port_build_proto_msg(p_ptr, CONN_ACK, ack); - } - tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); -} - -int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *seq) -{ - struct publication *publ; - u32 key; - - if (p_ptr->connected) - return -EINVAL; - key = p_ptr->ref + p_ptr->pub_count + 1; - if (key == p_ptr->ref) - return -EADDRINUSE; - - publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, - scope, p_ptr->ref, key); - if (publ) { - list_add(&publ->pport_list, &p_ptr->publications); - p_ptr->pub_count++; - p_ptr->published = 1; - return 0; - } - return -EINVAL; -} - -int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *seq) -{ - struct publication *publ; - struct publication *tpubl; - int res = -EINVAL; - - if (!seq) { - list_for_each_entry_safe(publ, tpubl, - &p_ptr->publications, pport_list) { - tipc_nametbl_withdraw(publ->type, publ->lower, - publ->ref, publ->key); - } - res = 0; - } else { - list_for_each_entry_safe(publ, tpubl, - &p_ptr->publications, pport_list) { - if (publ->scope != scope) - continue; - if (publ->type != seq->type) - continue; - if (publ->lower != seq->lower) - continue; - if (publ->upper != seq->upper) - break; - tipc_nametbl_withdraw(publ->type, publ->lower, - publ->ref, publ->key); - res = 0; - break; - } - } - if (list_empty(&p_ptr->publications)) - p_ptr->published = 0; - return res; -} - -int tipc_port_connect(u32 ref, struct tipc_portid const *peer) -{ - struct tipc_port *p_ptr; - int res; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - res = __tipc_port_connect(ref, p_ptr, peer); - tipc_port_unlock(p_ptr); - return res; -} - -/* - * __tipc_port_connect - connect to a remote peer - * - * Port must be locked. 
- */ -int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, - struct tipc_portid const *peer) -{ - struct tipc_msg *msg; - int res = -EINVAL; - - if (p_ptr->published || p_ptr->connected) - goto exit; - if (!peer->ref) - goto exit; - - msg = &p_ptr->phdr; - msg_set_destnode(msg, peer->node); - msg_set_destport(msg, peer->ref); - msg_set_type(msg, TIPC_CONN_MSG); - msg_set_lookup_scope(msg, 0); - msg_set_hdr_sz(msg, SHORT_H_SIZE); - - p_ptr->probing_interval = PROBING_INTERVAL; - p_ptr->probing_state = CONFIRMED; - p_ptr->connected = 1; - k_start_timer(&p_ptr->timer, p_ptr->probing_interval); - - tipc_nodesub_subscribe(&p_ptr->subscription, peer->node, - (void *)(unsigned long)ref, - (net_ev_handler)port_handle_node_down); - res = 0; -exit: - p_ptr->max_pkt = tipc_link_get_max_pkt(peer->node, ref); - return res; -} - -/* - * __tipc_disconnect - disconnect port from peer - * - * Port must be locked. - */ -int __tipc_port_disconnect(struct tipc_port *tp_ptr) -{ - if (tp_ptr->connected) { - tp_ptr->connected = 0; - /* let timer expire on it's own to avoid deadlock! */ - tipc_nodesub_unsubscribe(&tp_ptr->subscription); - return 0; - } - - return -ENOTCONN; -} - -/* - * tipc_port_disconnect(): Disconnect port form peer. - * This is a node local operation. - */ -int tipc_port_disconnect(u32 ref) -{ - struct tipc_port *p_ptr; - int res; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - res = __tipc_port_disconnect(p_ptr); - tipc_port_unlock(p_ptr); - return res; -} - -/* - * tipc_port_shutdown(): Send a SHUTDOWN msg to peer and disconnect - */ -int tipc_port_shutdown(u32 ref) -{ - struct tipc_port *p_ptr; - struct sk_buff *buf = NULL; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - - buf = port_build_peer_abort_msg(p_ptr, TIPC_CONN_SHUTDOWN); - tipc_port_unlock(p_ptr); - tipc_net_route_msg(buf); - return tipc_port_disconnect(ref); -} - -/* - * tipc_port_iovec_rcv: Concatenate and deliver sectioned - * message for this node. 
- */ -static int tipc_port_iovec_rcv(struct tipc_port *sender, - struct iovec const *msg_sect, - unsigned int len) -{ - struct sk_buff *buf; - int res; - - res = tipc_msg_build(&sender->phdr, msg_sect, len, MAX_MSG_SIZE, &buf); - if (likely(buf)) - tipc_sk_rcv(buf); - return res; -} - -/** - * tipc_send - send message sections on connection - */ -int tipc_send(struct tipc_port *p_ptr, - struct iovec const *msg_sect, - unsigned int len) -{ - u32 destnode; - int res; - - if (!p_ptr->connected) - return -EINVAL; - - p_ptr->congested = 1; - if (!tipc_port_congested(p_ptr)) { - destnode = tipc_port_peernode(p_ptr); - if (likely(!in_own_node(destnode))) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - destnode); - else - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - - if (likely(res != -ELINKCONG)) { - p_ptr->congested = 0; - if (res > 0) - p_ptr->sent++; - return res; - } - } - if (tipc_port_unreliable(p_ptr)) { - p_ptr->congested = 0; - return len; - } - return -ELINKCONG; -} - -/** - * tipc_send2name - send message sections to port name - */ -int tipc_send2name(struct tipc_port *p_ptr, - struct tipc_name const *name, - unsigned int domain, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *msg; - u32 destnode = domain; - u32 destport; - int res; - - if (p_ptr->connected) - return -EINVAL; - - msg = &p_ptr->phdr; - msg_set_type(msg, TIPC_NAMED_MSG); - msg_set_hdr_sz(msg, NAMED_H_SIZE); - msg_set_nametype(msg, name->type); - msg_set_nameinst(msg, name->instance); - msg_set_lookup_scope(msg, tipc_addr_scope(domain)); - destport = tipc_nametbl_translate(name->type, name->instance, &destnode); - msg_set_destnode(msg, destnode); - msg_set_destport(msg, destport); - - if (likely(destport || destnode)) { - if (likely(in_own_node(destnode))) - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - else if (tipc_own_addr) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - destnode); - else - res = tipc_port_iovec_reject(p_ptr, msg, 
msg_sect, - len, TIPC_ERR_NO_NODE); - if (likely(res != -ELINKCONG)) { - if (res > 0) - p_ptr->sent++; - return res; - } - if (tipc_port_unreliable(p_ptr)) - return len; - - return -ELINKCONG; - } - return tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, - TIPC_ERR_NO_NAME); -} - -/** - * tipc_send2port - send message sections to port identity - */ -int tipc_send2port(struct tipc_port *p_ptr, - struct tipc_portid const *dest, - struct iovec const *msg_sect, - unsigned int len) -{ - struct tipc_msg *msg; - int res; - - if (p_ptr->connected) - return -EINVAL; - - msg = &p_ptr->phdr; - msg_set_type(msg, TIPC_DIRECT_MSG); - msg_set_lookup_scope(msg, 0); - msg_set_destnode(msg, dest->node); - msg_set_destport(msg, dest->ref); - msg_set_hdr_sz(msg, BASIC_H_SIZE); - - if (in_own_node(dest->node)) - res = tipc_port_iovec_rcv(p_ptr, msg_sect, len); - else if (tipc_own_addr) - res = tipc_link_iovec_xmit_fast(p_ptr, msg_sect, len, - dest->node); - else - res = tipc_port_iovec_reject(p_ptr, msg, msg_sect, len, - TIPC_ERR_NO_NODE); - if (likely(res != -ELINKCONG)) { - if (res > 0) - p_ptr->sent++; - return res; - } - if (tipc_port_unreliable(p_ptr)) - return len; - - return -ELINKCONG; -} diff --git a/net/tipc/port.h b/net/tipc/port.h deleted file mode 100644 index cf4ca5b1d9a4..000000000000 --- a/net/tipc/port.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * net/tipc/port.h: Include file for TIPC port code - * - * Copyright (c) 1994-2007, 2014, Ericsson AB - * Copyright (c) 2004-2007, 2010-2013, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _TIPC_PORT_H -#define _TIPC_PORT_H - -#include "ref.h" -#include "net.h" -#include "msg.h" -#include "node_subscr.h" - -#define TIPC_CONNACK_INTV 256 -#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) -#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) - -/** - * struct tipc_port - TIPC port structure - * @lock: pointer to spinlock for controlling access to port - * @connected: non-zero if port is currently connected to a peer port - * @conn_type: TIPC type used when connection was established - * @conn_instance: TIPC instance used when connection was established - * @conn_unacked: number of unacknowledged messages received from peer port - * @published: non-zero if port has one or more associated names - * @congested: non-zero if cannot send because of link or port congestion - * @max_pkt: maximum packet size "hint" used when building messages sent by port - * @ref: unique reference to port in TIPC object registry - * @phdr: preformatted message header used when sending messages - * @port_list: adjacent ports in TIPC's global list of ports - * @wait_list: adjacent ports in list of ports waiting on link congestion - * @waiting_pkts: - * @sent: # of non-empty messages sent by port - * @acked: # of non-empty message acknowledgements from connected port's peer - * @publications: list of publications for port - * @pub_count: total # of publications port has made during its lifetime - * @probing_state: - * @probing_interval: - * @timer_ref: - * @subscription: "node down" subscription used to terminate failed connections - */ -struct tipc_port { - spinlock_t *lock; - int connected; - u32 conn_type; - u32 conn_instance; - u32 conn_unacked; - int published; - u32 congested; - u32 max_pkt; - u32 ref; - struct tipc_msg phdr; - struct list_head port_list; - struct list_head wait_list; - u32 waiting_pkts; - u32 sent; - u32 acked; - struct list_head publications; - u32 pub_count; - u32 probing_state; - u32 
probing_interval; - struct timer_list timer; - struct tipc_node_subscr subscription; -}; - -extern spinlock_t tipc_port_list_lock; -struct tipc_port_list; - -/* - * TIPC port manipulation routines - */ -u32 tipc_port_init(struct tipc_port *p_ptr, - const unsigned int importance); - -int tipc_reject_msg(struct sk_buff *buf, u32 err); - -void tipc_acknowledge(u32 port_ref, u32 ack); - -void tipc_port_destroy(struct tipc_port *p_ptr); - -int tipc_publish(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *name_seq); - -int tipc_withdraw(struct tipc_port *p_ptr, unsigned int scope, - struct tipc_name_seq const *name_seq); - -int tipc_port_connect(u32 portref, struct tipc_portid const *port); - -int tipc_port_disconnect(u32 portref); - -int tipc_port_shutdown(u32 ref); - -void tipc_port_wakeup(struct tipc_port *port); - -/* - * The following routines require that the port be locked on entry - */ -int __tipc_port_disconnect(struct tipc_port *tp_ptr); -int __tipc_port_connect(u32 ref, struct tipc_port *p_ptr, - struct tipc_portid const *peer); -int tipc_port_peer_msg(struct tipc_port *p_ptr, struct tipc_msg *msg); - -/* - * TIPC messaging routines - */ - -int tipc_send(struct tipc_port *port, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_send2name(struct tipc_port *port, - struct tipc_name const *name, - u32 domain, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_send2port(struct tipc_port *port, - struct tipc_portid const *dest, - struct iovec const *msg_sect, - unsigned int len); - -int tipc_port_mcast_xmit(struct tipc_port *port, - struct tipc_name_seq const *seq, - struct iovec const *msg, - unsigned int len); - -int tipc_port_iovec_reject(struct tipc_port *p_ptr, - struct tipc_msg *hdr, - struct iovec const *msg_sect, - unsigned int len, - int err); - -struct sk_buff *tipc_port_get_ports(void); -void tipc_port_proto_rcv(struct sk_buff *buf); -void tipc_port_mcast_rcv(struct sk_buff *buf, struct tipc_port_list 
*dp); -void tipc_port_reinit(void); - -/** - * tipc_port_lock - lock port instance referred to and return its pointer - */ -static inline struct tipc_port *tipc_port_lock(u32 ref) -{ - return (struct tipc_port *)tipc_ref_lock(ref); -} - -/** - * tipc_port_unlock - unlock a port instance - * - * Can use pointer instead of tipc_ref_unlock() since port is already locked. - */ -static inline void tipc_port_unlock(struct tipc_port *p_ptr) -{ - spin_unlock_bh(p_ptr->lock); -} - -static inline int tipc_port_congested(struct tipc_port *p_ptr) -{ - return ((p_ptr->sent - p_ptr->acked) >= TIPC_FLOWCTRL_WIN); -} - - -static inline u32 tipc_port_peernode(struct tipc_port *p_ptr) -{ - return msg_destnode(&p_ptr->phdr); -} - -static inline u32 tipc_port_peerport(struct tipc_port *p_ptr) -{ - return msg_destport(&p_ptr->phdr); -} - -static inline bool tipc_port_unreliable(struct tipc_port *port) -{ - return msg_src_droppable(&port->phdr) != 0; -} - -static inline void tipc_port_set_unreliable(struct tipc_port *port, - bool unreliable) -{ - msg_set_src_droppable(&port->phdr, unreliable ? 1 : 0); -} - -static inline bool tipc_port_unreturnable(struct tipc_port *port) -{ - return msg_dest_droppable(&port->phdr) != 0; -} - -static inline void tipc_port_set_unreturnable(struct tipc_port *port, - bool unreturnable) -{ - msg_set_dest_droppable(&port->phdr, unreturnable ? 1 : 0); -} - - -static inline int tipc_port_importance(struct tipc_port *port) -{ - return msg_importance(&port->phdr); -} - -static inline void tipc_port_set_importance(struct tipc_port *port, int imp) -{ - msg_set_importance(&port->phdr, (u32)imp); -} - -#endif diff --git a/net/tipc/ref.c b/net/tipc/ref.c deleted file mode 100644 index 3d4ecd754eee..000000000000 --- a/net/tipc/ref.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * net/tipc/ref.c: TIPC object registry code - * - * Copyright (c) 1991-2006, Ericsson AB - * Copyright (c) 2004-2007, Wind River Systems - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "core.h" -#include "ref.h" - -/** - * struct reference - TIPC object reference entry - * @object: pointer to object associated with reference entry - * @lock: spinlock controlling access to object - * @ref: reference value for object (combines instance & array index info) - */ -struct reference { - void *object; - spinlock_t lock; - u32 ref; -}; - -/** - * struct tipc_ref_table - table of TIPC object reference entries - * @entries: pointer to array of reference entries - * @capacity: array index of first unusable entry - * @init_point: array index of first uninitialized entry - * @first_free: array index of first unused object reference entry - * @last_free: array index of last unused object reference entry - * @index_mask: bitmask for array index portion of reference values - * @start_mask: initial value for instance value portion of reference values - */ -struct ref_table { - struct reference *entries; - u32 capacity; - u32 init_point; - u32 first_free; - u32 last_free; - u32 index_mask; - u32 start_mask; -}; - -/* - * Object reference table consists of 2**N entries. - * - * State Object ptr Reference - * ----- ---------- --------- - * In use non-NULL XXXX|own index - * (XXXX changes each time entry is acquired) - * Free NULL YYYY|next free index - * (YYYY is one more than last used XXXX) - * Uninitialized NULL 0 - * - * Entry 0 is not used; this allows index 0 to denote the end of the free list. - * - * Note that a reference value of 0 does not necessarily indicate that an - * entry is uninitialized, since the last entry in the free list could also - * have a reference value of 0 (although this is unlikely). 
- */ - -static struct ref_table tipc_ref_table; - -static DEFINE_SPINLOCK(ref_table_lock); - -/** - * tipc_ref_table_init - create reference table for objects - */ -int tipc_ref_table_init(u32 requested_size, u32 start) -{ - struct reference *table; - u32 actual_size; - - /* account for unused entry, then round up size to a power of 2 */ - - requested_size++; - for (actual_size = 16; actual_size < requested_size; actual_size <<= 1) - /* do nothing */ ; - - /* allocate table & mark all entries as uninitialized */ - table = vzalloc(actual_size * sizeof(struct reference)); - if (table == NULL) - return -ENOMEM; - - tipc_ref_table.entries = table; - tipc_ref_table.capacity = requested_size; - tipc_ref_table.init_point = 1; - tipc_ref_table.first_free = 0; - tipc_ref_table.last_free = 0; - tipc_ref_table.index_mask = actual_size - 1; - tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask; - - return 0; -} - -/** - * tipc_ref_table_stop - destroy reference table for objects - */ -void tipc_ref_table_stop(void) -{ - vfree(tipc_ref_table.entries); - tipc_ref_table.entries = NULL; -} - -/** - * tipc_ref_acquire - create reference to an object - * - * Register an object pointer in reference table and lock the object. - * Returns a unique reference value that is used from then on to retrieve the - * object pointer, or to determine that the object has been deregistered. - * - * Note: The object is returned in the locked state so that the caller can - * register a partially initialized object, without running the risk that - * the object will be accessed before initialization is complete. - */ -u32 tipc_ref_acquire(void *object, spinlock_t **lock) -{ - u32 index; - u32 index_mask; - u32 next_plus_upper; - u32 ref; - struct reference *entry = NULL; - - if (!object) { - pr_err("Attempt to acquire ref. to non-existent obj\n"); - return 0; - } - if (!tipc_ref_table.entries) { - pr_err("Ref. 
table not found in acquisition attempt\n"); - return 0; - } - - /* take a free entry, if available; otherwise initialize a new entry */ - spin_lock_bh(&ref_table_lock); - if (tipc_ref_table.first_free) { - index = tipc_ref_table.first_free; - entry = &(tipc_ref_table.entries[index]); - index_mask = tipc_ref_table.index_mask; - next_plus_upper = entry->ref; - tipc_ref_table.first_free = next_plus_upper & index_mask; - ref = (next_plus_upper & ~index_mask) + index; - } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { - index = tipc_ref_table.init_point++; - entry = &(tipc_ref_table.entries[index]); - spin_lock_init(&entry->lock); - ref = tipc_ref_table.start_mask + index; - } else { - ref = 0; - } - spin_unlock_bh(&ref_table_lock); - - /* - * Grab the lock so no one else can modify this entry - * While we assign its ref value & object pointer - */ - if (entry) { - spin_lock_bh(&entry->lock); - entry->ref = ref; - entry->object = object; - *lock = &entry->lock; - /* - * keep it locked, the caller is responsible - * for unlocking this when they're done with it - */ - } - - return ref; -} - -/** - * tipc_ref_discard - invalidate references to an object - * - * Disallow future references to an object and free up the entry for re-use. - * Note: The entry's spin_lock may still be busy after discard - */ -void tipc_ref_discard(u32 ref) -{ - struct reference *entry; - u32 index; - u32 index_mask; - - if (!tipc_ref_table.entries) { - pr_err("Ref. table not found during discard attempt\n"); - return; - } - - index_mask = tipc_ref_table.index_mask; - index = ref & index_mask; - entry = &(tipc_ref_table.entries[index]); - - spin_lock_bh(&ref_table_lock); - - if (!entry->object) { - pr_err("Attempt to discard ref. 
to non-existent obj\n"); - goto exit; - } - if (entry->ref != ref) { - pr_err("Attempt to discard non-existent reference\n"); - goto exit; - } - - /* - * mark entry as unused; increment instance part of entry's reference - * to invalidate any subsequent references - */ - entry->object = NULL; - entry->ref = (ref & ~index_mask) + (index_mask + 1); - - /* append entry to free entry list */ - if (tipc_ref_table.first_free == 0) - tipc_ref_table.first_free = index; - else - tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; - tipc_ref_table.last_free = index; - -exit: - spin_unlock_bh(&ref_table_lock); -} - -/** - * tipc_ref_lock - lock referenced object and return pointer to it - */ -void *tipc_ref_lock(u32 ref) -{ - if (likely(tipc_ref_table.entries)) { - struct reference *entry; - - entry = &tipc_ref_table.entries[ref & - tipc_ref_table.index_mask]; - if (likely(entry->ref != 0)) { - spin_lock_bh(&entry->lock); - if (likely((entry->ref == ref) && (entry->object))) - return entry->object; - spin_unlock_bh(&entry->lock); - } - } - return NULL; -} diff --git a/net/tipc/ref.h b/net/tipc/ref.h deleted file mode 100644 index d01aa1df63b8..000000000000 --- a/net/tipc/ref.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * net/tipc/ref.h: Include file for TIPC object registry code - * - * Copyright (c) 1991-2006, Ericsson AB - * Copyright (c) 2005-2006, Wind River Systems - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _TIPC_REF_H -#define _TIPC_REF_H - -int tipc_ref_table_init(u32 requested_size, u32 start); -void tipc_ref_table_stop(void); - -u32 tipc_ref_acquire(void *object, spinlock_t **lock); -void tipc_ref_discard(u32 ref); - -void *tipc_ref_lock(u32 ref); - -#endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ef0475568f9e..75275c5cf929 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -35,21 +35,84 @@ */ #include "core.h" -#include "port.h" +#include "name_table.h" #include "node.h" - +#include "link.h" #include <linux/export.h> +#include "config.h" +#include "socket.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ -#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ +#define CONN_PROBING_INTERVAL 3600000 /* [ms] => 1 h */ +#define TIPC_FWD_MSG 1 +#define TIPC_CONN_OK 0 +#define TIPC_CONN_PROBING 1 + +/** + * struct tipc_sock - TIPC socket structure + * @sk: socket - interacts with 'port' and with user via the socket API + * @connected: non-zero if port is currently connected to a peer port + * @conn_type: TIPC type used when connection was established + * @conn_instance: TIPC instance used when connection was established + * @published: non-zero if port has one or more associated names + * @max_pkt: maximum packet size "hint" used when building messages sent by port + * @ref: unique reference to port in TIPC object registry + * @phdr: preformatted message header used when sending messages + * @port_list: adjacent ports in TIPC's global list of ports + * @publications: list of publications for port + * @pub_count: total # of publications port has made during its lifetime + * @probing_state: + * @probing_interval: + * @timer: + * @port: port - interacts with 'sk' and with the rest of the TIPC stack + * @peer_name: the peer of the connection, if any + * @conn_timeout: the time we can wait for an 
unresponded setup request + * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue + * @link_cong: non-zero if owner must sleep because of link congestion + * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @rcv_unacked: # messages read by user, but not yet acked back to peer + */ +struct tipc_sock { + struct sock sk; + int connected; + u32 conn_type; + u32 conn_instance; + int published; + u32 max_pkt; + u32 ref; + struct tipc_msg phdr; + struct list_head sock_list; + struct list_head publications; + u32 pub_count; + u32 probing_state; + u32 probing_interval; + struct timer_list timer; + uint conn_timeout; + atomic_t dupl_rcvcnt; + bool link_cong; + uint sent_unacked; + uint rcv_unacked; +}; static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); +static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); +static void tipc_sk_timeout(unsigned long ref); +static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, + struct tipc_name_seq const *seq); +static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, + struct tipc_name_seq const *seq); +static u32 tipc_sk_ref_acquire(struct tipc_sock *tsk); +static void tipc_sk_ref_discard(u32 ref); +static struct tipc_sock *tipc_sk_get(u32 ref); +static struct tipc_sock *tipc_sk_get_next(u32 *ref); +static void tipc_sk_put(struct tipc_sock *tsk); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -103,29 +166,115 @@ static struct proto tipc_proto_kern; * - port reference */ -#include "socket.h" +static u32 tsk_peer_node(struct tipc_sock *tsk) +{ + return msg_destnode(&tsk->phdr); +} + +static u32 tsk_peer_port(struct tipc_sock *tsk) +{ + return msg_destport(&tsk->phdr); +} + +static bool 
tsk_unreliable(struct tipc_sock *tsk) +{ + return msg_src_droppable(&tsk->phdr) != 0; +} + +static void tsk_set_unreliable(struct tipc_sock *tsk, bool unreliable) +{ + msg_set_src_droppable(&tsk->phdr, unreliable ? 1 : 0); +} + +static bool tsk_unreturnable(struct tipc_sock *tsk) +{ + return msg_dest_droppable(&tsk->phdr) != 0; +} + +static void tsk_set_unreturnable(struct tipc_sock *tsk, bool unreturnable) +{ + msg_set_dest_droppable(&tsk->phdr, unreturnable ? 1 : 0); +} + +static int tsk_importance(struct tipc_sock *tsk) +{ + return msg_importance(&tsk->phdr); +} + +static int tsk_set_importance(struct tipc_sock *tsk, int imp) +{ + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tsk->phdr, (u32)imp); + return 0; +} + +static struct tipc_sock *tipc_sk(const struct sock *sk) +{ + return container_of(sk, struct tipc_sock, sk); +} + +static int tsk_conn_cong(struct tipc_sock *tsk) +{ + return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; +} /** - * advance_rx_queue - discard first buffer in socket receive queue + * tsk_advance_rx_queue - discard first buffer in socket receive queue * * Caller must hold socket lock */ -static void advance_rx_queue(struct sock *sk) +static void tsk_advance_rx_queue(struct sock *sk) { kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } /** - * reject_rx_queue - reject all buffers in socket receive queue + * tsk_rej_rx_queue - reject all buffers in socket receive queue * * Caller must hold socket lock */ -static void reject_rx_queue(struct sock *sk) +static void tsk_rej_rx_queue(struct sock *sk) { struct sk_buff *buf; + u32 dnode; + + while ((buf = __skb_dequeue(&sk->sk_receive_queue))) { + if (tipc_msg_reverse(buf, &dnode, TIPC_ERR_NO_PORT)) + tipc_link_xmit(buf, dnode, 0); + } +} + +/* tsk_peer_msg - verify if message was sent by connected port's peer + * + * Handles cases where the node's network address has changed from + * the default of <0.0.0> to its configured setting. 
+ */ +static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) +{ + u32 peer_port = tsk_peer_port(tsk); + u32 orig_node; + u32 peer_node; + + if (unlikely(!tsk->connected)) + return false; - while ((buf = __skb_dequeue(&sk->sk_receive_queue))) - tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + if (unlikely(msg_origport(msg) != peer_port)) + return false; + + orig_node = msg_orignode(msg); + peer_node = tsk_peer_node(tsk); + + if (likely(orig_node == peer_node)) + return true; + + if (!orig_node && (peer_node == tipc_own_addr)) + return true; + + if (!peer_node && (orig_node == tipc_own_addr)) + return true; + + return false; } /** @@ -147,7 +296,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, socket_state state; struct sock *sk; struct tipc_sock *tsk; - struct tipc_port *port; + struct tipc_msg *msg; u32 ref; /* Validate arguments */ @@ -182,32 +331,36 @@ static int tipc_sk_create(struct net *net, struct socket *sock, return -ENOMEM; tsk = tipc_sk(sk); - port = &tsk->port; - - ref = tipc_port_init(port, TIPC_LOW_IMPORTANCE); + ref = tipc_sk_ref_acquire(tsk); if (!ref) { - pr_warn("Socket registration failed, ref. 
table exhausted\n"); - sk_free(sk); + pr_warn("Socket create failed; reference table exhausted\n"); return -ENOMEM; } + tsk->max_pkt = MAX_PKT_DEFAULT; + tsk->ref = ref; + INIT_LIST_HEAD(&tsk->publications); + msg = &tsk->phdr; + tipc_msg_init(msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, + NAMED_H_SIZE, 0); + msg_set_origport(msg, ref); /* Finish initializing socket data structures */ sock->ops = ops; sock->state = state; - sock_init_data(sock, sk); + k_init_timer(&tsk->timer, (Handler)tipc_sk_timeout, ref); sk->sk_backlog_rcv = tipc_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; + tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); - tipc_port_unlock(port); if (sock->state == SS_READY) { - tipc_port_set_unreturnable(port, true); + tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) - tipc_port_set_unreliable(port, true); + tsk_set_unreliable(tsk, true); } return 0; } @@ -301,8 +454,8 @@ static int tipc_release(struct socket *sock) { struct sock *sk = sock->sk; struct tipc_sock *tsk; - struct tipc_port *port; struct sk_buff *buf; + u32 dnode; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -312,13 +465,13 @@ static int tipc_release(struct socket *sock) return 0; tsk = tipc_sk(sk); - port = &tsk->port; lock_sock(sk); /* * Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer) */ + dnode = tsk_peer_node(tsk); while (sock->state != SS_DISCONNECTING) { buf = __skb_dequeue(&sk->sk_receive_queue); if (buf == NULL) @@ -329,16 +482,27 @@ static int tipc_release(struct socket *sock) if ((sock->state == SS_CONNECTING) || (sock->state == SS_CONNECTED)) { sock->state = SS_DISCONNECTING; - tipc_port_disconnect(port->ref); + tsk->connected = 0; + tipc_node_remove_conn(dnode, tsk->ref); } - tipc_reject_msg(buf, TIPC_ERR_NO_PORT); + if (tipc_msg_reverse(buf, 
&dnode, TIPC_ERR_NO_PORT)) + tipc_link_xmit(buf, dnode, 0); } } - /* Destroy TIPC port; also disconnects an active connection and - * sends a 'FIN-' to peer. - */ - tipc_port_destroy(port); + tipc_sk_withdraw(tsk, 0, NULL); + tipc_sk_ref_discard(tsk->ref); + k_cancel_timer(&tsk->timer); + if (tsk->connected) { + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, dnode, tipc_own_addr, + tsk_peer_port(tsk), + tsk->ref, TIPC_ERR_NO_PORT); + if (buf) + tipc_link_xmit(buf, dnode, tsk->ref); + tipc_node_remove_conn(dnode, tsk->ref); + } + k_term_timer(&tsk->timer); /* Discard any remaining (connection-based) messages in receive queue */ __skb_queue_purge(&sk->sk_receive_queue); @@ -346,7 +510,6 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; release_sock(sk); - sock_put(sk); sock->sk = NULL; @@ -378,7 +541,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, lock_sock(sk); if (unlikely(!uaddr_len)) { - res = tipc_withdraw(&tsk->port, 0, NULL); + res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } @@ -406,8 +569,8 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, } res = (addr->scope > 0) ? 
- tipc_publish(&tsk->port, addr->scope, &addr->addr.nameseq) : - tipc_withdraw(&tsk->port, -addr->scope, &addr->addr.nameseq); + tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : + tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: release_sock(sk); return res; @@ -437,10 +600,10 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, if ((sock->state != SS_CONNECTED) && ((peer != 2) || (sock->state != SS_DISCONNECTING))) return -ENOTCONN; - addr->addr.id.ref = tipc_port_peerport(&tsk->port); - addr->addr.id.node = tipc_port_peernode(&tsk->port); + addr->addr.id.ref = tsk_peer_port(tsk); + addr->addr.id.node = tsk_peer_node(tsk); } else { - addr->addr.id.ref = tsk->port.ref; + addr->addr.id.ref = tsk->ref; addr->addr.id.node = tipc_own_addr; } @@ -504,12 +667,12 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch ((int)sock->state) { case SS_UNCONNECTED: - if (!tsk->port.congested) + if (!tsk->link_cong) mask |= POLLOUT; break; case SS_READY: case SS_CONNECTED: - if (!tsk->port.congested) + if (!tsk->link_cong && !tsk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case SS_CONNECTING: @@ -526,6 +689,136 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, } /** + * tipc_sendmcast - send multicast message + * @sock: socket structure + * @seq: destination address + * @iov: message data to send + * @dsz: total length of message data + * @timeo: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, + struct iovec *iov, size_t dsz, long timeo) +{ + struct sock *sk = sock->sk; + struct tipc_msg *mhdr = &tipc_sk(sk)->phdr; + struct sk_buff *buf; + uint mtu; + int rc; + + msg_set_type(mhdr, TIPC_MCAST_MSG); + msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); + msg_set_destport(mhdr, 0); + 
msg_set_destnode(mhdr, 0); + msg_set_nametype(mhdr, seq->type); + msg_set_namelower(mhdr, seq->lower); + msg_set_nameupper(mhdr, seq->upper); + msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + +new_mtu: + mtu = tipc_bclink_get_mtu(); + rc = tipc_msg_build(mhdr, iov, 0, dsz, mtu, &buf); + if (unlikely(rc < 0)) + return rc; + + do { + rc = tipc_bclink_xmit(buf); + if (likely(rc >= 0)) { + rc = dsz; + break; + } + if (rc == -EMSGSIZE) + goto new_mtu; + if (rc != -ELINKCONG) + break; + tipc_sk(sk)->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); + return rc; +} + +/* tipc_sk_mcast_rcv - Deliver multicast message to all destination sockets + */ +void tipc_sk_mcast_rcv(struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + struct tipc_port_list dports = {0, NULL, }; + struct tipc_port_list *item; + struct sk_buff *b; + uint i, last, dst = 0; + u32 scope = TIPC_CLUSTER_SCOPE; + + if (in_own_node(msg_orignode(msg))) + scope = TIPC_NODE_SCOPE; + + /* Create destination port list: */ + tipc_nametbl_mc_translate(msg_nametype(msg), + msg_namelower(msg), + msg_nameupper(msg), + scope, + &dports); + last = dports.count; + if (!last) { + kfree_skb(buf); + return; + } + + for (item = &dports; item; item = item->next) { + for (i = 0; i < PLSIZE && ++dst <= last; i++) { + b = (dst != last) ? skb_clone(buf, GFP_ATOMIC) : buf; + if (!b) { + pr_warn("Failed do clone mcast rcv buffer\n"); + continue; + } + msg_set_destport(msg, item->ports[i]); + tipc_sk_rcv(b); + } + } + tipc_port_list_free(&dports); +} + +/** + * tipc_sk_proto_rcv - receive a connection mng protocol message + * @tsk: receiving socket + * @dnode: node to send response message to, if any + * @buf: buffer containing protocol message + * Returns 0 (TIPC_OK) if message was consumed, 1 (TIPC_FWD_MSG) if + * (CONN_PROBE_REPLY) message should be forwarded. 
+ */ +static int tipc_sk_proto_rcv(struct tipc_sock *tsk, u32 *dnode, + struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + int conn_cong; + + /* Ignore if connection cannot be validated: */ + if (!tsk_peer_msg(tsk, msg)) + goto exit; + + tsk->probing_state = TIPC_CONN_OK; + + if (msg_type(msg) == CONN_ACK) { + conn_cong = tsk_conn_cong(tsk); + tsk->sent_unacked -= msg_msgcnt(msg); + if (conn_cong) + tsk->sk.sk_write_space(&tsk->sk); + } else if (msg_type(msg) == CONN_PROBE) { + if (!tipc_msg_reverse(buf, dnode, TIPC_OK)) + return TIPC_OK; + msg_set_type(msg, CONN_PROBE_REPLY); + return TIPC_FWD_MSG; + } + /* Do nothing if msg_type() == CONN_PROBE_REPLY */ +exit: + kfree_skb(buf); + return TIPC_OK; +} + +/** * dest_name_check - verify user is permitted to send to specified port name * @dest: destination address * @m: descriptor for message to be sent @@ -539,6 +832,8 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) { struct tipc_cfg_msg_hdr hdr; + if (unlikely(dest->addrtype == TIPC_ADDR_ID)) + return 0; if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES)) return 0; if (likely(dest->addr.name.name.type == TIPC_TOP_SRV)) @@ -575,19 +870,18 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) return sock_intr_errno(*timeo_p); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - done = sk_wait_event(sk, timeo_p, !tsk->port.congested); + done = sk_wait_event(sk, timeo_p, !tsk->link_cong); finish_wait(sk_sleep(sk), &wait); } while (!done); return 0; } - /** * tipc_sendmsg - send message in connectionless manner * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure * @m: message to send - * @total_len: length of message + * @dsz: amount of user data to be sent * * Message must have an destination specified explicitly. 
* Used for SOCK_RDM and SOCK_DGRAM messages, @@ -597,100 +891,122 @@ static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) * Returns the number of bytes sent on success, or errno otherwise */ static int tipc_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) + struct msghdr *m, size_t dsz) { + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - int needs_conn; + struct tipc_msg *mhdr = &tsk->phdr; + struct iovec *iov = m->msg_iov; + u32 dnode, dport; + struct sk_buff *buf; + struct tipc_name_seq *seq = &dest->addr.nameseq; + u32 mtu; long timeo; - int res = -EINVAL; + int rc = -EINVAL; if (unlikely(!dest)) return -EDESTADDRREQ; + if (unlikely((m->msg_namelen < sizeof(*dest)) || (dest->family != AF_TIPC))) return -EINVAL; - if (total_len > TIPC_MAX_USER_MSG_SIZE) + + if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; if (iocb) lock_sock(sk); - needs_conn = (sock->state != SS_READY); - if (unlikely(needs_conn)) { + if (unlikely(sock->state != SS_READY)) { if (sock->state == SS_LISTENING) { - res = -EPIPE; + rc = -EPIPE; goto exit; } if (sock->state != SS_UNCONNECTED) { - res = -EISCONN; + rc = -EISCONN; goto exit; } - if (tsk->port.published) { - res = -EOPNOTSUPP; + if (tsk->published) { + rc = -EOPNOTSUPP; goto exit; } if (dest->addrtype == TIPC_ADDR_NAME) { - tsk->port.conn_type = dest->addr.name.name.type; - tsk->port.conn_instance = dest->addr.name.name.instance; + tsk->conn_type = dest->addr.name.name.type; + tsk->conn_instance = dest->addr.name.name.instance; } - - /* Abort any pending connection attempts (very unlikely) */ - reject_rx_queue(sk); } + rc = dest_name_check(dest, m); + if (rc) + goto exit; timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - do { - if (dest->addrtype == TIPC_ADDR_NAME) { - res = dest_name_check(dest, 
m); - if (res) - break; - res = tipc_send2name(port, - &dest->addr.name.name, - dest->addr.name.domain, - m->msg_iov, - total_len); - } else if (dest->addrtype == TIPC_ADDR_ID) { - res = tipc_send2port(port, - &dest->addr.id, - m->msg_iov, - total_len); - } else if (dest->addrtype == TIPC_ADDR_MCAST) { - if (needs_conn) { - res = -EOPNOTSUPP; - break; - } - res = dest_name_check(dest, m); - if (res) - break; - res = tipc_port_mcast_xmit(port, - &dest->addr.nameseq, - m->msg_iov, - total_len); + + if (dest->addrtype == TIPC_ADDR_MCAST) { + rc = tipc_sendmcast(sock, seq, iov, dsz, timeo); + goto exit; + } else if (dest->addrtype == TIPC_ADDR_NAME) { + u32 type = dest->addr.name.name.type; + u32 inst = dest->addr.name.name.instance; + u32 domain = dest->addr.name.domain; + + dnode = domain; + msg_set_type(mhdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(mhdr, NAMED_H_SIZE); + msg_set_nametype(mhdr, type); + msg_set_nameinst(mhdr, inst); + msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); + dport = tipc_nametbl_translate(type, inst, &dnode); + msg_set_destnode(mhdr, dnode); + msg_set_destport(mhdr, dport); + if (unlikely(!dport && !dnode)) { + rc = -EHOSTUNREACH; + goto exit; } - if (likely(res != -ELINKCONG)) { - if (needs_conn && (res >= 0)) + } else if (dest->addrtype == TIPC_ADDR_ID) { + dnode = dest->addr.id.node; + msg_set_type(mhdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(mhdr, 0); + msg_set_destnode(mhdr, dnode); + msg_set_destport(mhdr, dest->addr.id.ref); + msg_set_hdr_sz(mhdr, BASIC_H_SIZE); + } + +new_mtu: + mtu = tipc_node_get_mtu(dnode, tsk->ref); + rc = tipc_msg_build(mhdr, iov, 0, dsz, mtu, &buf); + if (rc < 0) + goto exit; + + do { + TIPC_SKB_CB(buf)->wakeup_pending = tsk->link_cong; + rc = tipc_link_xmit(buf, dnode, tsk->ref); + if (likely(rc >= 0)) { + if (sock->state != SS_READY) sock->state = SS_CONNECTING; + rc = dsz; break; } - res = tipc_wait_for_sndmsg(sock, &timeo); - if (res) + if (rc == -EMSGSIZE) + goto new_mtu; + if (rc != -ELINKCONG) break; - 
} while (1); - + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); exit: if (iocb) release_sock(sk); - return res; + + return rc; } static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; DEFINE_WAIT(wait); int done; @@ -709,37 +1025,48 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); done = sk_wait_event(sk, timeo_p, - (!port->congested || !port->connected)); + (!tsk->link_cong && + !tsk_conn_cong(tsk)) || + !tsk->connected); finish_wait(sk_sleep(sk), &wait); } while (!done); return 0; } /** - * tipc_send_packet - send a connection-oriented message - * @iocb: if NULL, indicates that socket lock is already held + * tipc_send_stream - send stream-oriented data + * @iocb: (unused) * @sock: socket structure - * @m: message to send - * @total_len: length of message + * @m: data to send + * @dsz: total length of data to be transmitted * - * Used for SOCK_SEQPACKET messages and SOCK_STREAM data. + * Used for SOCK_STREAM data. 
* - * Returns the number of bytes sent on success, or errno otherwise + * Returns the number of bytes sent on success (or partial success), + * or errno if no data sent */ -static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *mhdr = &tsk->phdr; + struct sk_buff *buf; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - int res = -EINVAL; + u32 ref = tsk->ref; + int rc = -EINVAL; long timeo; + u32 dnode; + uint mtu, send, sent = 0; /* Handle implied connection establishment */ - if (unlikely(dest)) - return tipc_sendmsg(iocb, sock, m, total_len); - - if (total_len > TIPC_MAX_USER_MSG_SIZE) + if (unlikely(dest)) { + rc = tipc_sendmsg(iocb, sock, m, dsz); + if (dsz && (dsz == rc)) + tsk->sent_unacked = 1; + return rc; + } + if (dsz > (uint)INT_MAX) return -EMSGSIZE; if (iocb) @@ -747,148 +1074,88 @@ static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, if (unlikely(sock->state != SS_CONNECTED)) { if (sock->state == SS_DISCONNECTING) - res = -EPIPE; + rc = -EPIPE; else - res = -ENOTCONN; + rc = -ENOTCONN; goto exit; } timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + dnode = tsk_peer_node(tsk); + +next: + mtu = tsk->max_pkt; + send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); + rc = tipc_msg_build(mhdr, m->msg_iov, sent, send, mtu, &buf); + if (unlikely(rc < 0)) + goto exit; do { - res = tipc_send(&tsk->port, m->msg_iov, total_len); - if (likely(res != -ELINKCONG)) - break; - res = tipc_wait_for_sndpkt(sock, &timeo); - if (res) - break; - } while (1); + if (likely(!tsk_conn_cong(tsk))) { + rc = tipc_link_xmit(buf, dnode, ref); + if (likely(!rc)) { + tsk->sent_unacked++; + sent += send; + if (sent == dsz) + break; + goto next; + } + if (rc == -EMSGSIZE) { + tsk->max_pkt = 
tipc_node_get_mtu(dnode, ref); + goto next; + } + if (rc != -ELINKCONG) + break; + tsk->link_cong = 1; + } + rc = tipc_wait_for_sndpkt(sock, &timeo); + if (rc) + kfree_skb_list(buf); + } while (!rc); exit: if (iocb) release_sock(sk); - return res; + return sent ? sent : rc; } /** - * tipc_send_stream - send stream-oriented data - * @iocb: (unused) + * tipc_send_packet - send a connection-oriented message + * @iocb: if NULL, indicates that socket lock is already held * @sock: socket structure - * @m: data to send - * @total_len: total length of data to be sent + * @m: message to send + * @dsz: length of data to be transmitted * - * Used for SOCK_STREAM data. + * Used for SOCK_SEQPACKET messages. * - * Returns the number of bytes sent on success (or partial success), - * or errno if no data sent + * Returns the number of bytes sent on success, or errno otherwise */ -static int tipc_send_stream(struct kiocb *iocb, struct socket *sock, - struct msghdr *m, size_t total_len) +static int tipc_send_packet(struct kiocb *iocb, struct socket *sock, + struct msghdr *m, size_t dsz) { - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct msghdr my_msg; - struct iovec my_iov; - struct iovec *curr_iov; - int curr_iovlen; - char __user *curr_start; - u32 hdr_size; - int curr_left; - int bytes_to_send; - int bytes_sent; - int res; - - lock_sock(sk); - - /* Handle special cases where there is no connection */ - if (unlikely(sock->state != SS_CONNECTED)) { - if (sock->state == SS_UNCONNECTED) - res = tipc_send_packet(NULL, sock, m, total_len); - else - res = sock->state == SS_DISCONNECTING ? -EPIPE : -ENOTCONN; - goto exit; - } - - if (unlikely(m->msg_name)) { - res = -EISCONN; - goto exit; - } - - if (total_len > (unsigned int)INT_MAX) { - res = -EMSGSIZE; - goto exit; - } - - /* - * Send each iovec entry using one or more messages - * - * Note: This algorithm is good for the most likely case - * (i.e. 
one large iovec entry), but could be improved to pass sets - * of small iovec entries into send_packet(). - */ - curr_iov = m->msg_iov; - curr_iovlen = m->msg_iovlen; - my_msg.msg_iov = &my_iov; - my_msg.msg_iovlen = 1; - my_msg.msg_flags = m->msg_flags; - my_msg.msg_name = NULL; - bytes_sent = 0; - - hdr_size = msg_hdr_sz(&tsk->port.phdr); - - while (curr_iovlen--) { - curr_start = curr_iov->iov_base; - curr_left = curr_iov->iov_len; - - while (curr_left) { - bytes_to_send = tsk->port.max_pkt - hdr_size; - if (bytes_to_send > TIPC_MAX_USER_MSG_SIZE) - bytes_to_send = TIPC_MAX_USER_MSG_SIZE; - if (curr_left < bytes_to_send) - bytes_to_send = curr_left; - my_iov.iov_base = curr_start; - my_iov.iov_len = bytes_to_send; - res = tipc_send_packet(NULL, sock, &my_msg, - bytes_to_send); - if (res < 0) { - if (bytes_sent) - res = bytes_sent; - goto exit; - } - curr_left -= bytes_to_send; - curr_start += bytes_to_send; - bytes_sent += bytes_to_send; - } + if (dsz > TIPC_MAX_USER_MSG_SIZE) + return -EMSGSIZE; - curr_iov++; - } - res = bytes_sent; -exit: - release_sock(sk); - return res; + return tipc_send_stream(iocb, sock, m, dsz); } -/** - * auto_connect - complete connection setup to a remote port - * @tsk: tipc socket structure - * @msg: peer's response message - * - * Returns 0 on success, errno otherwise +/* tipc_sk_finish_conn - complete the setup of a connection */ -static int auto_connect(struct tipc_sock *tsk, struct tipc_msg *msg) +static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, + u32 peer_node) { - struct tipc_port *port = &tsk->port; - struct socket *sock = tsk->sk.sk_socket; - struct tipc_portid peer; - - peer.ref = msg_origport(msg); - peer.node = msg_orignode(msg); - - __tipc_port_connect(port->ref, port, &peer); - - if (msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&port->phdr, (u32)msg_importance(msg)); - sock->state = SS_CONNECTED; - return 0; + struct tipc_msg *msg = &tsk->phdr; + + 
msg_set_destnode(msg, peer_node); + msg_set_destport(msg, peer_port); + msg_set_type(msg, TIPC_CONN_MSG); + msg_set_lookup_scope(msg, 0); + msg_set_hdr_sz(msg, SHORT_H_SIZE); + + tsk->probing_interval = CONN_PROBING_INTERVAL; + tsk->probing_state = TIPC_CONN_OK; + tsk->connected = 1; + k_start_timer(&tsk->timer, tsk->probing_interval); + tipc_node_add_conn(peer_node, tsk->ref, peer_port); + tsk->max_pkt = tipc_node_get_mtu(peer_node, tsk->ref); } /** @@ -915,17 +1182,17 @@ static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) } /** - * anc_data_recv - optionally capture ancillary data for received message + * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info * @msg: received message header - * @tport: TIPC port associated with message + * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ -static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, - struct tipc_port *tport) +static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, + struct tipc_sock *tsk) { u32 anc_data[3]; u32 err; @@ -968,10 +1235,10 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, anc_data[2] = msg_nameupper(msg); break; case TIPC_CONN_MSG: - has_name = (tport->conn_type != 0); - anc_data[0] = tport->conn_type; - anc_data[1] = tport->conn_instance; - anc_data[2] = tport->conn_instance; + has_name = (tsk->conn_type != 0); + anc_data[0] = tsk->conn_type; + anc_data[1] = tsk->conn_instance; + anc_data[2] = tsk->conn_instance; break; default: has_name = 0; @@ -985,6 +1252,24 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } +static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) +{ + struct sk_buff *buf = NULL; + struct tipc_msg *msg; + u32 peer_port = tsk_peer_port(tsk); + u32 dnode = tsk_peer_node(tsk); + + if (!tsk->connected) + return; + 
buf = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, dnode, + tipc_own_addr, peer_port, tsk->ref, TIPC_OK); + if (!buf) + return; + msg = buf_msg(buf); + msg_set_msgcnt(msg, ack); + tipc_link_xmit(buf, dnode, msg_link_selector(msg)); +} + static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) { struct sock *sk = sock->sk; @@ -1035,7 +1320,6 @@ static int tipc_recvmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; long timeo; @@ -1070,7 +1354,7 @@ restart: /* Discard an empty non-errored message & try again */ if ((!sz) && (!err)) { - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); goto restart; } @@ -1078,7 +1362,7 @@ restart: set_orig_addr(m, msg); /* Capture ancillary data (optional) */ - res = anc_data_recv(m, msg, port); + res = tipc_sk_anc_data_recv(m, msg, tsk); if (res) goto exit; @@ -1104,9 +1388,11 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { if ((sock->state != SS_READY) && - (++port->conn_unacked >= TIPC_CONNACK_INTV)) - tipc_acknowledge(port->ref, port->conn_unacked); - advance_rx_queue(sk); + (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { + tipc_sk_send_ack(tsk, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + } + tsk_advance_rx_queue(sk); } exit: release_sock(sk); @@ -1130,7 +1416,6 @@ static int tipc_recv_stream(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; struct tipc_msg *msg; long timeo; @@ -1168,14 +1453,14 @@ restart: /* Discard an empty non-errored message & try again */ if ((!sz) && (!err)) { - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); goto restart; } /* Optionally capture sender's address & ancillary data of first msg */ if (sz_copied == 0) { set_orig_addr(m, msg); - res = anc_data_recv(m, msg, port); + res = 
tipc_sk_anc_data_recv(m, msg, tsk); if (res) goto exit; } @@ -1213,9 +1498,11 @@ restart: /* Consume received message (optional) */ if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++port->conn_unacked >= TIPC_CONNACK_INTV)) - tipc_acknowledge(port->ref, port->conn_unacked); - advance_rx_queue(sk); + if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { + tipc_sk_send_ack(tsk, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + } + tsk_advance_rx_queue(sk); } /* Loop around if more data is required */ @@ -1269,18 +1556,14 @@ static void tipc_data_ready(struct sock *sk) * @tsk: TIPC socket * @msg: message * - * Returns TIPC error status code and socket error status code - * once it encounters some errors + * Returns 0 (TIPC_OK) if everyting ok, -TIPC_ERR_NO_PORT otherwise */ -static u32 filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) +static int filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) { struct sock *sk = &tsk->sk; - struct tipc_port *port = &tsk->port; struct socket *sock = sk->sk_socket; struct tipc_msg *msg = buf_msg(*buf); - - u32 retval = TIPC_ERR_NO_PORT; - int res; + int retval = -TIPC_ERR_NO_PORT; if (msg_mcast(msg)) return retval; @@ -1288,16 +1571,23 @@ static u32 filter_connect(struct tipc_sock *tsk, struct sk_buff **buf) switch ((int)sock->state) { case SS_CONNECTED: /* Accept only connection-based messages sent by peer */ - if (msg_connected(msg) && tipc_port_peer_msg(port, msg)) { + if (tsk_peer_msg(tsk, msg)) { if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; - __tipc_port_disconnect(port); + tsk->connected = 0; + /* let timer expire on it's own */ + tipc_node_remove_conn(tsk_peer_node(tsk), + tsk->ref); } retval = TIPC_OK; } break; case SS_CONNECTING: /* Accept only ACK or NACK message */ + + if (unlikely(!msg_connected(msg))) + break; + if (unlikely(msg_errcode(msg))) { sock->state = SS_DISCONNECTING; sk->sk_err = ECONNREFUSED; @@ -1305,17 +1595,17 @@ static u32 filter_connect(struct tipc_sock *tsk, 
struct sk_buff **buf) break; } - if (unlikely(!msg_connected(msg))) - break; - - res = auto_connect(tsk, msg); - if (res) { + if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) { sock->state = SS_DISCONNECTING; - sk->sk_err = -res; + sk->sk_err = EINVAL; retval = TIPC_OK; break; } + tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg)); + msg_set_importance(&tsk->phdr, msg_importance(msg)); + sock->state = SS_CONNECTED; + /* If an incoming message is an 'ACK-', it should be * discarded here because it doesn't contain useful * data. In addition, we should try to wake up @@ -1382,32 +1672,44 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) * * Called with socket lock already taken; port lock may also be taken. * - * Returns TIPC error status code (TIPC_OK if message is not to be rejected) + * Returns 0 (TIPC_OK) if message was consumed, -TIPC error code if message + * to be rejected, 1 (TIPC_FWD_MSG) if (CONN_MANAGER) message to be forwarded */ -static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) +static int filter_rcv(struct sock *sk, struct sk_buff *buf) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *msg = buf_msg(buf); unsigned int limit = rcvbuf_limit(sk, buf); - u32 res = TIPC_OK; + u32 onode; + int rc = TIPC_OK; + + if (unlikely(msg_user(msg) == CONN_MANAGER)) + return tipc_sk_proto_rcv(tsk, &onode, buf); + + if (unlikely(msg_user(msg) == SOCK_WAKEUP)) { + kfree_skb(buf); + tsk->link_cong = 0; + sk->sk_write_space(sk); + return TIPC_OK; + } /* Reject message if it is wrong sort of message for socket */ if (msg_type(msg) > TIPC_DIRECT_MSG) - return TIPC_ERR_NO_PORT; + return -TIPC_ERR_NO_PORT; if (sock->state == SS_READY) { if (msg_connected(msg)) - return TIPC_ERR_NO_PORT; + return -TIPC_ERR_NO_PORT; } else { - res = filter_connect(tsk, &buf); - if (res != TIPC_OK || buf == NULL) - return res; + rc = filter_connect(tsk, &buf); + if (rc != TIPC_OK || buf == NULL) + 
return rc; } /* Reject message if there isn't room to queue it */ if (sk_rmem_alloc_get(sk) + buf->truesize >= limit) - return TIPC_ERR_OVERLOAD; + return -TIPC_ERR_OVERLOAD; /* Enqueue message */ TIPC_SKB_CB(buf)->handle = NULL; @@ -1429,16 +1731,23 @@ static u32 filter_rcv(struct sock *sk, struct sk_buff *buf) */ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *buf) { - u32 res; + int rc; + u32 onode; struct tipc_sock *tsk = tipc_sk(sk); uint truesize = buf->truesize; - res = filter_rcv(sk, buf); - if (unlikely(res)) - tipc_reject_msg(buf, res); + rc = filter_rcv(sk, buf); + + if (likely(!rc)) { + if (atomic_read(&tsk->dupl_rcvcnt) < TIPC_CONN_OVERLOAD_LIMIT) + atomic_add(truesize, &tsk->dupl_rcvcnt); + return 0; + } + + if ((rc < 0) && !tipc_msg_reverse(buf, &onode, -rc)) + return 0; - if (atomic_read(&tsk->dupl_rcvcnt) < TIPC_CONN_OVERLOAD_LIMIT) - atomic_add(truesize, &tsk->dupl_rcvcnt); + tipc_link_xmit(buf, onode, 0); return 0; } @@ -1452,49 +1761,42 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *buf) int tipc_sk_rcv(struct sk_buff *buf) { struct tipc_sock *tsk; - struct tipc_port *port; struct sock *sk; u32 dport = msg_destport(buf_msg(buf)); - int err = TIPC_OK; + int rc = TIPC_OK; uint limit; + u32 dnode; - /* Forward unresolved named message */ - if (unlikely(!dport)) { - tipc_net_route_msg(buf); - return 0; - } - - /* Validate destination */ - port = tipc_port_lock(dport); - if (unlikely(!port)) { - err = TIPC_ERR_NO_PORT; + /* Validate destination and message */ + tsk = tipc_sk_get(dport); + if (unlikely(!tsk)) { + rc = tipc_msg_eval(buf, &dnode); goto exit; } - - tsk = tipc_port_to_sock(port); sk = &tsk->sk; /* Queue message */ bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - err = filter_rcv(sk, buf); + rc = filter_rcv(sk, buf); } else { if (sk->sk_backlog.len == 0) atomic_set(&tsk->dupl_rcvcnt, 0); limit = rcvbuf_limit(sk, buf) + atomic_read(&tsk->dupl_rcvcnt); if (sk_add_backlog(sk, buf, limit)) - err = 
TIPC_ERR_OVERLOAD; + rc = -TIPC_ERR_OVERLOAD; } - bh_unlock_sock(sk); - tipc_port_unlock(port); - - if (likely(!err)) + tipc_sk_put(tsk); + if (likely(!rc)) return 0; exit: - tipc_reject_msg(buf, err); - return -EHOSTUNREACH; + if ((rc < 0) && !tipc_msg_reverse(buf, &dnode, -rc)) + return -EHOSTUNREACH; + + tipc_link_xmit(buf, dnode, 0); + return (rc < 0) ? -EHOSTUNREACH : 0; } static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) @@ -1673,10 +1975,8 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) { struct sock *new_sk, *sk = sock->sk; struct sk_buff *buf; - struct tipc_port *new_port; + struct tipc_sock *new_tsock; struct tipc_msg *msg; - struct tipc_portid peer; - u32 new_ref; long timeo; int res; @@ -1698,8 +1998,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) goto exit; new_sk = new_sock->sk; - new_port = &tipc_sk(new_sk)->port; - new_ref = new_port->ref; + new_tsock = tipc_sk(new_sk); msg = buf_msg(buf); /* we lock on new_sk; but lockdep sees the lock on sk */ @@ -1709,18 +2008,16 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) * Reject any stray messages received by new socket * before the socket lock was taken (very, very unlikely) */ - reject_rx_queue(new_sk); + tsk_rej_rx_queue(new_sk); /* Connect new socket to it's peer */ - peer.ref = msg_origport(msg); - peer.node = msg_orignode(msg); - tipc_port_connect(new_ref, &peer); + tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); new_sock->state = SS_CONNECTED; - tipc_port_set_importance(new_port, msg_importance(msg)); + tsk_set_importance(new_tsock, msg_importance(msg)); if (msg_named(msg)) { - new_port->conn_type = msg_nametype(msg); - new_port->conn_instance = msg_nameinst(msg); + new_tsock->conn_type = msg_nametype(msg); + new_tsock->conn_instance = msg_nameinst(msg); } /* @@ -1730,7 +2027,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int 
flags) if (!msg_data_sz(msg)) { struct msghdr m = {NULL,}; - advance_rx_queue(sk); + tsk_advance_rx_queue(sk); tipc_send_packet(NULL, new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); @@ -1756,8 +2053,8 @@ static int tipc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; struct sk_buff *buf; + u32 dnode; int res; if (how != SHUT_RDWR) @@ -1777,14 +2074,21 @@ restart: kfree_skb(buf); goto restart; } - tipc_port_disconnect(port->ref); - tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN); + if (tipc_msg_reverse(buf, &dnode, TIPC_CONN_SHUTDOWN)) + tipc_link_xmit(buf, dnode, tsk->ref); + tipc_node_remove_conn(dnode, tsk->ref); } else { - tipc_port_shutdown(port->ref); + dnode = tsk_peer_node(tsk); + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, + TIPC_CONN_MSG, SHORT_H_SIZE, + 0, dnode, tipc_own_addr, + tsk_peer_port(tsk), + tsk->ref, TIPC_CONN_SHUTDOWN); + tipc_link_xmit(buf, dnode, tsk->ref); } - + tsk->connected = 0; sock->state = SS_DISCONNECTING; - + tipc_node_remove_conn(dnode, tsk->ref); /* fall through */ case SS_DISCONNECTING: @@ -1805,6 +2109,432 @@ restart: return res; } +static void tipc_sk_timeout(unsigned long ref) +{ + struct tipc_sock *tsk; + struct sock *sk; + struct sk_buff *buf = NULL; + u32 peer_port, peer_node; + + tsk = tipc_sk_get(ref); + if (!tsk) + return; + + sk = &tsk->sk; + bh_lock_sock(sk); + if (!tsk->connected) { + bh_unlock_sock(sk); + goto exit; + } + peer_port = tsk_peer_port(tsk); + peer_node = tsk_peer_node(tsk); + + if (tsk->probing_state == TIPC_CONN_PROBING) { + /* Previous probe not answered -> self abort */ + buf = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, + SHORT_H_SIZE, 0, tipc_own_addr, + peer_node, ref, peer_port, + TIPC_ERR_NO_PORT); + } else { + buf = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, + 0, peer_node, tipc_own_addr, + peer_port, ref, TIPC_OK); + tsk->probing_state = TIPC_CONN_PROBING; + 
k_start_timer(&tsk->timer, tsk->probing_interval); + } + bh_unlock_sock(sk); + if (buf) + tipc_link_xmit(buf, peer_node, ref); +exit: + tipc_sk_put(tsk); +} + +static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, + struct tipc_name_seq const *seq) +{ + struct publication *publ; + u32 key; + + if (tsk->connected) + return -EINVAL; + key = tsk->ref + tsk->pub_count + 1; + if (key == tsk->ref) + return -EADDRINUSE; + + publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper, + scope, tsk->ref, key); + if (unlikely(!publ)) + return -EINVAL; + + list_add(&publ->pport_list, &tsk->publications); + tsk->pub_count++; + tsk->published = 1; + return 0; +} + +static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, + struct tipc_name_seq const *seq) +{ + struct publication *publ; + struct publication *safe; + int rc = -EINVAL; + + list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { + if (seq) { + if (publ->scope != scope) + continue; + if (publ->type != seq->type) + continue; + if (publ->lower != seq->lower) + continue; + if (publ->upper != seq->upper) + break; + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + rc = 0; + break; + } + tipc_nametbl_withdraw(publ->type, publ->lower, + publ->ref, publ->key); + rc = 0; + } + if (list_empty(&tsk->publications)) + tsk->published = 0; + return rc; +} + +static int tipc_sk_show(struct tipc_sock *tsk, char *buf, + int len, int full_id) +{ + struct publication *publ; + int ret; + + if (full_id) + ret = tipc_snprintf(buf, len, "<%u.%u.%u:%u>:", + tipc_zone(tipc_own_addr), + tipc_cluster(tipc_own_addr), + tipc_node(tipc_own_addr), tsk->ref); + else + ret = tipc_snprintf(buf, len, "%-10u:", tsk->ref); + + if (tsk->connected) { + u32 dport = tsk_peer_port(tsk); + u32 destnode = tsk_peer_node(tsk); + + ret += tipc_snprintf(buf + ret, len - ret, + " connected to <%u.%u.%u:%u>", + tipc_zone(destnode), + tipc_cluster(destnode), + tipc_node(destnode), dport); + if (tsk->conn_type != 
0) + ret += tipc_snprintf(buf + ret, len - ret, + " via {%u,%u}", tsk->conn_type, + tsk->conn_instance); + } else if (tsk->published) { + ret += tipc_snprintf(buf + ret, len - ret, " bound to"); + list_for_each_entry(publ, &tsk->publications, pport_list) { + if (publ->lower == publ->upper) + ret += tipc_snprintf(buf + ret, len - ret, + " {%u,%u}", publ->type, + publ->lower); + else + ret += tipc_snprintf(buf + ret, len - ret, + " {%u,%u,%u}", publ->type, + publ->lower, publ->upper); + } + } + ret += tipc_snprintf(buf + ret, len - ret, "\n"); + return ret; +} + +struct sk_buff *tipc_sk_socks_show(void) +{ + struct sk_buff *buf; + struct tlv_desc *rep_tlv; + char *pb; + int pb_len; + struct tipc_sock *tsk; + int str_len = 0; + u32 ref = 0; + + buf = tipc_cfg_reply_alloc(TLV_SPACE(ULTRA_STRING_MAX_LEN)); + if (!buf) + return NULL; + rep_tlv = (struct tlv_desc *)buf->data; + pb = TLV_DATA(rep_tlv); + pb_len = ULTRA_STRING_MAX_LEN; + + tsk = tipc_sk_get_next(&ref); + for (; tsk; tsk = tipc_sk_get_next(&ref)) { + lock_sock(&tsk->sk); + str_len += tipc_sk_show(tsk, pb + str_len, + pb_len - str_len, 0); + release_sock(&tsk->sk); + tipc_sk_put(tsk); + } + str_len += 1; /* for "\0" */ + skb_put(buf, TLV_SPACE(str_len)); + TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); + + return buf; +} + +/* tipc_sk_reinit: set non-zero address in all existing sockets + * when we go from standalone to network mode. 
+ */ +void tipc_sk_reinit(void) +{ + struct tipc_msg *msg; + u32 ref = 0; + struct tipc_sock *tsk = tipc_sk_get_next(&ref); + + for (; tsk; tsk = tipc_sk_get_next(&ref)) { + lock_sock(&tsk->sk); + msg = &tsk->phdr; + msg_set_prevnode(msg, tipc_own_addr); + msg_set_orignode(msg, tipc_own_addr); + release_sock(&tsk->sk); + tipc_sk_put(tsk); + } +} + +/** + * struct reference - TIPC socket reference entry + * @tsk: pointer to socket associated with reference entry + * @ref: reference value for socket (combines instance & array index info) + */ +struct reference { + struct tipc_sock *tsk; + u32 ref; +}; + +/** + * struct tipc_ref_table - table of TIPC socket reference entries + * @entries: pointer to array of reference entries + * @capacity: array index of first unusable entry + * @init_point: array index of first uninitialized entry + * @first_free: array index of first unused socket reference entry + * @last_free: array index of last unused socket reference entry + * @index_mask: bitmask for array index portion of reference values + * @start_mask: initial value for instance value portion of reference values + */ +struct ref_table { + struct reference *entries; + u32 capacity; + u32 init_point; + u32 first_free; + u32 last_free; + u32 index_mask; + u32 start_mask; +}; + +/* Socket reference table consists of 2**N entries. + * + * State Socket ptr Reference + * ----- ---------- --------- + * In use non-NULL XXXX|own index + * (XXXX changes each time entry is acquired) + * Free NULL YYYY|next free index + * (YYYY is one more than last used XXXX) + * Uninitialized NULL 0 + * + * Entry 0 is not used; this allows index 0 to denote the end of the free list. + * + * Note that a reference value of 0 does not necessarily indicate that an + * entry is uninitialized, since the last entry in the free list could also + * have a reference value of 0 (although this is unlikely). 
+ */ + +static struct ref_table tipc_ref_table; + +static DEFINE_RWLOCK(ref_table_lock); + +/** + * tipc_ref_table_init - create reference table for sockets + */ +int tipc_sk_ref_table_init(u32 req_sz, u32 start) +{ + struct reference *table; + u32 actual_sz; + + /* account for unused entry, then round up size to a power of 2 */ + + req_sz++; + for (actual_sz = 16; actual_sz < req_sz; actual_sz <<= 1) { + /* do nothing */ + }; + + /* allocate table & mark all entries as uninitialized */ + table = vzalloc(actual_sz * sizeof(struct reference)); + if (table == NULL) + return -ENOMEM; + + tipc_ref_table.entries = table; + tipc_ref_table.capacity = req_sz; + tipc_ref_table.init_point = 1; + tipc_ref_table.first_free = 0; + tipc_ref_table.last_free = 0; + tipc_ref_table.index_mask = actual_sz - 1; + tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask; + + return 0; +} + +/** + * tipc_ref_table_stop - destroy reference table for sockets + */ +void tipc_sk_ref_table_stop(void) +{ + if (!tipc_ref_table.entries) + return; + vfree(tipc_ref_table.entries); + tipc_ref_table.entries = NULL; +} + +/* tipc_ref_acquire - create reference to a socket + * + * Register an socket pointer in the reference table. + * Returns a unique reference value that is used from then on to retrieve the + * socket pointer, or to determine if the socket has been deregistered. + */ +u32 tipc_sk_ref_acquire(struct tipc_sock *tsk) +{ + u32 index; + u32 index_mask; + u32 next_plus_upper; + u32 ref = 0; + struct reference *entry; + + if (unlikely(!tsk)) { + pr_err("Attempt to acquire ref. to non-existent obj\n"); + return 0; + } + if (unlikely(!tipc_ref_table.entries)) { + pr_err("Ref. 
table not found in acquisition attempt\n"); + return 0; + } + + /* Take a free entry, if available; otherwise initialize a new one */ + write_lock_bh(&ref_table_lock); + index = tipc_ref_table.first_free; + entry = &tipc_ref_table.entries[index]; + + if (likely(index)) { + index = tipc_ref_table.first_free; + entry = &tipc_ref_table.entries[index]; + index_mask = tipc_ref_table.index_mask; + next_plus_upper = entry->ref; + tipc_ref_table.first_free = next_plus_upper & index_mask; + ref = (next_plus_upper & ~index_mask) + index; + entry->tsk = tsk; + } else if (tipc_ref_table.init_point < tipc_ref_table.capacity) { + index = tipc_ref_table.init_point++; + entry = &tipc_ref_table.entries[index]; + ref = tipc_ref_table.start_mask + index; + } + + if (ref) { + entry->ref = ref; + entry->tsk = tsk; + } + write_unlock_bh(&ref_table_lock); + return ref; +} + +/* tipc_sk_ref_discard - invalidate reference to an socket + * + * Disallow future references to an socket and free up the entry for re-use. + */ +void tipc_sk_ref_discard(u32 ref) +{ + struct reference *entry; + u32 index; + u32 index_mask; + + if (unlikely(!tipc_ref_table.entries)) { + pr_err("Ref. table not found during discard attempt\n"); + return; + } + + index_mask = tipc_ref_table.index_mask; + index = ref & index_mask; + entry = &tipc_ref_table.entries[index]; + + write_lock_bh(&ref_table_lock); + + if (unlikely(!entry->tsk)) { + pr_err("Attempt to discard ref. 
to non-existent socket\n"); + goto exit; + } + if (unlikely(entry->ref != ref)) { + pr_err("Attempt to discard non-existent reference\n"); + goto exit; + } + + /* Mark entry as unused; increment instance part of entry's + * reference to invalidate any subsequent references + */ + + entry->tsk = NULL; + entry->ref = (ref & ~index_mask) + (index_mask + 1); + + /* Append entry to free entry list */ + if (unlikely(tipc_ref_table.first_free == 0)) + tipc_ref_table.first_free = index; + else + tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index; + tipc_ref_table.last_free = index; +exit: + write_unlock_bh(&ref_table_lock); +} + +/* tipc_sk_get - find referenced socket and return pointer to it + */ +struct tipc_sock *tipc_sk_get(u32 ref) +{ + struct reference *entry; + struct tipc_sock *tsk; + + if (unlikely(!tipc_ref_table.entries)) + return NULL; + read_lock_bh(&ref_table_lock); + entry = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask]; + tsk = entry->tsk; + if (likely(tsk && (entry->ref == ref))) + sock_hold(&tsk->sk); + else + tsk = NULL; + read_unlock_bh(&ref_table_lock); + return tsk; +} + +/* tipc_sk_get_next - lock & return next socket after referenced one +*/ +struct tipc_sock *tipc_sk_get_next(u32 *ref) +{ + struct reference *entry; + struct tipc_sock *tsk = NULL; + uint index = *ref & tipc_ref_table.index_mask; + + read_lock_bh(&ref_table_lock); + while (++index < tipc_ref_table.capacity) { + entry = &tipc_ref_table.entries[index]; + if (!entry->tsk) + continue; + tsk = entry->tsk; + sock_hold(&tsk->sk); + *ref = entry->ref; + break; + } + read_unlock_bh(&ref_table_lock); + return tsk; +} + +static void tipc_sk_put(struct tipc_sock *tsk) +{ + sock_put(&tsk->sk); +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -1823,7 +2553,6 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; u32 value; int 
res; @@ -1841,16 +2570,16 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - tipc_port_set_importance(port, value); + res = tsk_set_importance(tsk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) - tipc_port_set_unreliable(port, value); + tsk_set_unreliable(tsk, value); else res = -ENOPROTOOPT; break; case TIPC_DEST_DROPPABLE: - tipc_port_set_unreturnable(port, value); + tsk_set_unreturnable(tsk, value); break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = value; @@ -1883,7 +2612,6 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_port *port = &tsk->port; int len; u32 value; int res; @@ -1900,16 +2628,16 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - value = tipc_port_importance(port); + value = tsk_importance(tsk); break; case TIPC_SRC_DROPPABLE: - value = tipc_port_unreliable(port); + value = tsk_unreliable(tsk); break; case TIPC_DEST_DROPPABLE: - value = tipc_port_unreturnable(port); + value = tsk_unreturnable(tsk); break; case TIPC_CONN_TIMEOUT: - value = tipc_sk(sk)->conn_timeout; + value = tsk->conn_timeout; /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: @@ -1936,7 +2664,7 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, return put_user(sizeof(value), ol); } -int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) +static int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) { struct tipc_sioc_ln_req lnr; void __user *argp = (void __user *)arg; @@ -1952,7 +2680,6 @@ int tipc_ioctl(struct socket *sk, unsigned int cmd, unsigned long arg) return 0; } return -EADDRNOTAVAIL; - break; default: return -ENOIOCTLCMD; } diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 3afcd2a70b31..baa43d03901e 100644 --- 
a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -35,40 +35,17 @@ #ifndef _TIPC_SOCK_H #define _TIPC_SOCK_H -#include "port.h" #include <net/sock.h> -/** - * struct tipc_sock - TIPC socket structure - * @sk: socket - interacts with 'port' and with user via the socket API - * @port: port - interacts with 'sk' and with the rest of the TIPC stack - * @peer_name: the peer of the connection, if any - * @conn_timeout: the time we can wait for an unresponded setup request - * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue - */ - -struct tipc_sock { - struct sock sk; - struct tipc_port port; - unsigned int conn_timeout; - atomic_t dupl_rcvcnt; -}; - -static inline struct tipc_sock *tipc_sk(const struct sock *sk) -{ - return container_of(sk, struct tipc_sock, sk); -} - -static inline struct tipc_sock *tipc_port_to_sock(const struct tipc_port *port) -{ - return container_of(port, struct tipc_sock, port); -} - -static inline void tipc_sock_wakeup(struct tipc_sock *tsk) -{ - tsk->sk.sk_write_space(&tsk->sk); -} - +#define TIPC_CONNACK_INTV 256 +#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) +#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ + SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) int tipc_sk_rcv(struct sk_buff *buf); +struct sk_buff *tipc_sk_socks_show(void); +void tipc_sk_mcast_rcv(struct sk_buff *buf); +void tipc_sk_reinit(void); +int tipc_sk_ref_table_init(u32 requested_size, u32 start); +void tipc_sk_ref_table_stop(void); #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 642437231ad5..31b5cb232a43 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -36,7 +36,6 @@ #include "core.h" #include "name_table.h" -#include "port.h" #include "subscr.h" /** diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index f3fef93325a8..1a779b1e8510 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -47,6 +47,13 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = 
"named_timeout", + .data = &sysctl_tipc_named_timeout, + .maxlen = sizeof(sysctl_tipc_named_timeout), + .mode = 0644, + .proc_handler = proc_dointvec, + }, {} }; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 9bc73f87f64a..99f7012b23b9 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -258,7 +258,7 @@ static void inc_inflight_move_tail(struct unix_sock *u) list_move_tail(&u->link, &gc_candidates); } -static bool gc_in_progress = false; +static bool gc_in_progress; #define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c index 72273abfcb16..a21508d11036 100644 --- a/net/wimax/id-table.c +++ b/net/wimax/id-table.c @@ -137,7 +137,7 @@ void wimax_id_table_release(void) #endif spin_lock(&wimax_id_table_lock); list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) { - printk(KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n", + pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n", __func__, wimax_dev, wimax_dev->net_dev->ifindex); WARN_ON(1); } diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c index c278b3356f75..54aa146930bd 100644 --- a/net/wimax/op-msg.c +++ b/net/wimax/op-msg.c @@ -189,7 +189,7 @@ const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if (nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return NULL; } *size = nla_len(nla); @@ -211,7 +211,7 @@ const void *wimax_msg_data(struct sk_buff *msg) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if (nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return NULL; } return nla_data(nla); @@ -232,7 +232,7 @@ ssize_t wimax_msg_len(struct sk_buff *msg) nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr), WIMAX_GNL_MSG_DATA); if 
(nla == NULL) { - printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n"); + pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n"); return -EINVAL; } return nla_len(nla); @@ -343,8 +343,7 @@ int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_MSG_FROM_USER: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]); diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c index eb4580784d9d..a42079165e1f 100644 --- a/net/wimax/op-reset.c +++ b/net/wimax/op-reset.c @@ -107,8 +107,7 @@ int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]); diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index 403078d670a9..7d730543f243 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -421,8 +421,7 @@ int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]); diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c index 995c08c827b5..e6788d281d0e 100644 --- a/net/wimax/op-state-get.c +++ b/net/wimax/op-state-get.c @@ -49,8 +49,7 @@ int 
wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info) d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info); result = -ENODEV; if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) { - printk(KERN_ERR "WIMAX_GNL_OP_STATE_GET: can't find IFIDX " - "attribute\n"); + pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n"); goto error_no_wimax_dev; } ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]); diff --git a/net/wimax/stack.c b/net/wimax/stack.c index ec8b577db135..3f816e2971ee 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -191,8 +191,8 @@ void __check_new_state(enum wimax_st old_state, enum wimax_st new_state, unsigned int allowed_states_bm) { if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) { - printk(KERN_ERR "SW BUG! Forbidden state change %u -> %u\n", - old_state, new_state); + pr_err("SW BUG! Forbidden state change %u -> %u\n", + old_state, new_state); } } @@ -602,8 +602,7 @@ int __init wimax_subsys_init(void) wimax_gnl_ops, wimax_gnl_mcgrps); if (unlikely(result < 0)) { - printk(KERN_ERR "cannot register generic netlink family: %d\n", - result); + pr_err("cannot register generic netlink family: %d\n", result); goto error_register_family; } diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h index b445b82020a8..733c4bf8d4b3 100644 --- a/net/wimax/wimax-internal.h +++ b/net/wimax/wimax-internal.h @@ -30,6 +30,12 @@ #define __WIMAX_INTERNAL_H__ #ifdef __KERNEL__ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/device.h> #include <net/wimax.h> diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 405f3c4cf70c..29c8675f9a11 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -162,6 +162,12 @@ config CFG80211_INTERNAL_REGDB and includes code to query that database. This is an alternative to using CRDA for defining regulatory rules for the kernel. 
+ Using this option requires some parsing of the db.txt at build time, + the parser will be upkept with the latest wireless-regdb updates but + older wireless-regdb formats will be ignored. The parser may later + be replaced to avoid issues with conflicts on versions of + wireless-regdb. + For details see: http://wireless.kernel.org/en/developers/Regulatory diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 992b34070bcb..72d81e2154d5 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -4,6 +4,7 @@ * any point in time. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/export.h> diff --git a/net/wireless/core.c b/net/wireless/core.c index a1c40654dd9b..f52a4cd7017c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -2,6 +2,7 @@ * This is the linux wireless configuration interface. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -25,7 +26,6 @@ #include "sysfs.h" #include "debugfs.h" #include "wext-compat.h" -#include "ethtool.h" #include "rdev-ops.h" /* name for sysfs, %d is appended */ @@ -493,12 +493,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; - /* - * There are major locking problems in nl80211/mac80211 for CSA, - * disable for all drivers until this has been reworked. 
- */ - wiphy->flags &= ~WIPHY_FLAG_HAS_CHANNEL_SWITCH; - #ifdef CONFIG_PM if (WARN_ON(wiphy->wowlan && (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) && @@ -636,6 +630,9 @@ int wiphy_register(struct wiphy *wiphy) if (IS_ERR(rdev->wiphy.debugfsdir)) rdev->wiphy.debugfsdir = NULL; + cfg80211_debugfs_rdev_add(rdev); + nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); + if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { struct regulatory_request request; @@ -647,8 +644,6 @@ int wiphy_register(struct wiphy *wiphy) nl80211_send_reg_change_event(&request); } - cfg80211_debugfs_rdev_add(rdev); - rdev->wiphy.registered = true; rtnl_unlock(); @@ -660,8 +655,6 @@ int wiphy_register(struct wiphy *wiphy) return res; } - nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); - return 0; } EXPORT_SYMBOL(wiphy_register); @@ -927,8 +920,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; - netdev_set_default_ethtool_ops(dev, &cfg80211_ethtool_ops); - if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) @@ -1015,7 +1006,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); #ifdef CONFIG_CFG80211_WEXT - kfree(wdev->wext.keys); + kzfree(wdev->wext.keys); #endif } /* diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index d4860bfc020e..e9e91298c70d 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -1,11 +1,9 @@ #include <linux/utsname.h> #include <net/cfg80211.h> #include "core.h" -#include "ethtool.h" #include "rdev-ops.h" -static void cfg80211_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) +void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -23,84 +21,4 @@ static void 
cfg80211_get_drvinfo(struct net_device *dev, strlcpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), sizeof(info->bus_info)); } - -static int cfg80211_get_regs_len(struct net_device *dev) -{ - /* For now, return 0... */ - return 0; -} - -static void cfg80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, - void *data) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - - regs->version = wdev->wiphy->hw_version; - regs->len = 0; -} - -static void cfg80211_get_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - - memset(rp, 0, sizeof(*rp)); - - if (rdev->ops->get_ringparam) - rdev_get_ringparam(rdev, &rp->tx_pending, &rp->tx_max_pending, - &rp->rx_pending, &rp->rx_max_pending); -} - -static int cfg80211_set_ringparam(struct net_device *dev, - struct ethtool_ringparam *rp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - - if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) - return -EINVAL; - - if (rdev->ops->set_ringparam) - return rdev_set_ringparam(rdev, rp->tx_pending, rp->rx_pending); - - return -ENOTSUPP; -} - -static int cfg80211_get_sset_count(struct net_device *dev, int sset) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - if (rdev->ops->get_et_sset_count) - return rdev_get_et_sset_count(rdev, dev, sset); - return -EOPNOTSUPP; -} - -static void cfg80211_get_stats(struct net_device *dev, - struct ethtool_stats *stats, u64 *data) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - if (rdev->ops->get_et_stats) - rdev_get_et_stats(rdev, dev, stats, data); -} - -static void cfg80211_get_strings(struct net_device *dev, u32 sset, u8 *data) -{ - struct wireless_dev *wdev = 
dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - if (rdev->ops->get_et_strings) - rdev_get_et_strings(rdev, dev, sset, data); -} - -const struct ethtool_ops cfg80211_ethtool_ops = { - .get_drvinfo = cfg80211_get_drvinfo, - .get_regs_len = cfg80211_get_regs_len, - .get_regs = cfg80211_get_regs, - .get_link = ethtool_op_get_link, - .get_ringparam = cfg80211_get_ringparam, - .set_ringparam = cfg80211_set_ringparam, - .get_strings = cfg80211_get_strings, - .get_ethtool_stats = cfg80211_get_stats, - .get_sset_count = cfg80211_get_sset_count, -}; +EXPORT_SYMBOL(cfg80211_get_drvinfo); diff --git a/net/wireless/ethtool.h b/net/wireless/ethtool.h deleted file mode 100644 index 695ecad20bd6..000000000000 --- a/net/wireless/ethtool.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CFG80211_ETHTOOL__ -#define __CFG80211_ETHTOOL__ - -extern const struct ethtool_ops cfg80211_ethtool_ops; - -#endif /* __CFG80211_ETHTOOL__ */ diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk index 40c37fc5b67c..baf2426b555a 100644 --- a/net/wireless/genregdb.awk +++ b/net/wireless/genregdb.awk @@ -51,32 +51,41 @@ function parse_country_head() { function parse_reg_rule() { + flag_starts_at = 7 + start = $1 sub(/\(/, "", start) end = $3 bw = $5 sub(/\),/, "", bw) - gain = $6 - sub(/\(/, "", gain) - sub(/,/, "", gain) - power = $7 - sub(/\)/, "", power) - sub(/,/, "", power) + gain = 0 + power = $6 # power might be in mW... 
- units = $8 + units = $7 + dfs_cac = 0 + + sub(/\(/, "", power) + sub(/\),/, "", power) + sub(/\),/, "", units) sub(/\)/, "", units) - sub(/,/, "", units) - dfs_cac = $9 + if (units == "mW") { + flag_starts_at = 8 power = 10 * log(power)/log(10) + if ($8 ~ /[[:digit:]]/) { + flag_starts_at = 9 + dfs_cac = $8 + } } else { - dfs_cac = $8 + if ($7 ~ /[[:digit:]]/) { + flag_starts_at = 8 + dfs_cac = $7 + } } - sub(/,/, "", dfs_cac) sub(/\(/, "", dfs_cac) - sub(/\)/, "", dfs_cac) + sub(/\),/, "", dfs_cac) flagstr = "" - for (i=8; i<=NF; i++) + for (i=flag_starts_at; i<=NF; i++) flagstr = flagstr $i split(flagstr, flagarray, ",") flags = "" diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 8f345da3ea5f..e24fc585c883 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -115,7 +115,7 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, } if (WARN_ON(wdev->connect_keys)) - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = connkeys; wdev->ibss_fixed = params->channel_fixed; @@ -161,7 +161,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) ASSERT_WDEV_LOCK(wdev); - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; rdev_set_qos_map(rdev, dev, NULL); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 266766b8d80b..2c52b59e43f3 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -19,7 +19,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *buf, size_t len) + const u8 *buf, size_t len, int uapsd_queues) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -43,7 +43,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, return; } - nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL); + nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues); /* update current_bss etc., consumes the bss reference */ 
__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, @@ -605,7 +605,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, } bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, - const u8 *buf, size_t len, u32 flags, gfp_t gfp) + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -648,7 +648,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_mbm, - buf, len, flags, gfp)) + buf, len, flags, GFP_ATOMIC)) continue; result = true; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6668daf69326..cb9f5a44ffad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2,6 +2,7 @@ * This is the new netlink-based wireless configuration interface. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/if.h> @@ -225,6 +226,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -337,6 +339,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TDLS_OPERATION] = { .type = NLA_U8 }, [NL80211_ATTR_TDLS_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_TDLS_EXTERNAL_SETUP] = { .type = NLA_FLAG }, + [NL80211_ATTR_TDLS_INITIATOR] = { .type = NLA_FLAG }, [NL80211_ATTR_DONT_WAIT_FOR_ACK] = { .type = NLA_FLAG }, [NL80211_ATTR_PROBE_RESP] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, @@ -387,6 +390,11 @@ 
static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, + [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, + [NL80211_ATTR_TSID] = { .type = NLA_U8 }, + [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, + [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -1506,6 +1514,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); + if (rdev->wiphy.flags & + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ #undef CMD @@ -2236,11 +2247,21 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) + return -EINVAL; + coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); changed |= WIPHY_PARAM_COVERAGE_CLASS; } + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) + return -EOPNOTSUPP; + + changed |= WIPHY_PARAM_DYN_ACK; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; @@ -3325,6 +3346,29 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { + params.smps_mode = + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); + switch (params.smps_mode) { + case NL80211_SMPS_OFF: + break; + case NL80211_SMPS_STATIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_STATIC_SMPS)) + return -EINVAL; + break; + case NL80211_SMPS_DYNAMIC: + if (!(rdev->wiphy.features & + 
NL80211_FEATURE_DYNAMIC_SMPS)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } else { + params.smps_mode = NL80211_SMPS_OFF; + } + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, &params); if (!err) { @@ -3813,7 +3857,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, { if (params->listen_interval != -1) return -EINVAL; - if (params->aid) + if (params->aid && + !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) return -EINVAL; /* When you run into this, adjust the code below for the new flag */ @@ -6011,17 +6056,6 @@ skip_beacons: params.radar_required = true; } - /* TODO: I left this here for now. With channel switch, the - * verification is a bit more complicated, because we only do - * it later when the channel switch really happens. - */ - err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, - params.chandef.chan, - CHAN_MODE_SHARED, - radar_detect_width); - if (err) - return err; - if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) params.block_tx = true; @@ -6042,7 +6076,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, const struct cfg80211_bss_ies *ies; void *hdr; struct nlattr *bss; - bool tsf = false; ASSERT_WDEV_LOCK(wdev); @@ -6069,18 +6102,27 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; rcu_read_lock(); + /* indicate whether we have probe response data or not */ + if (rcu_access_pointer(res->proberesp_ies) && + nla_put_flag(msg, NL80211_BSS_PRESP_DATA)) + goto fail_unlock_rcu; + + /* this pointer prefers to be pointed to probe response data + * but is always valid + */ ies = rcu_dereference(res->ies); if (ies) { if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) goto fail_unlock_rcu; - tsf = true; if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, ies->len, ies->data)) goto fail_unlock_rcu; } + + /* and this pointer is always (unless driver didn't know) beacon data */ ies = rcu_dereference(res->beacon_ies); - if
(ies) { - if (!tsf && nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + if (ies && ies->from_beacon) { + if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, ies->len, ies->data)) @@ -6584,6 +6626,14 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + req.flags |= ASSOC_REQ_USE_RRM; + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); @@ -6846,7 +6896,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) - kfree(connkeys); + kzfree(connkeys); return err; } @@ -6978,6 +7028,9 @@ void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp) struct nlattr *data = ((void **)skb->cb)[2]; enum nl80211_multicast_groups mcgrp = NL80211_MCGRP_TESTMODE; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + nla_nest_end(skb, data); genlmsg_end(skb, hdr); @@ -7215,7 +7268,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } memcpy(&connect.ht_capa, @@ -7233,7 +7286,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) { if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } memcpy(&connect.vht_capa, @@ -7241,11 +7294,19 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.vht_capa)); } + if 
(nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + connect.flags |= ASSOC_REQ_USE_RRM; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); if (err) - kfree(connkeys); + kzfree(connkeys); return err; } @@ -7364,6 +7425,7 @@ static int nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info) u32 peer_capability = 0; u16 status_code; u8 *peer; + bool initiator; if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) || !rdev->ops->tdls_mgmt) @@ -7380,12 +7442,14 @@ static int nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info) action_code = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_ACTION]); status_code = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); dialog_token = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN]); + initiator = nla_get_flag(info->attrs[NL80211_ATTR_TDLS_INITIATOR]); if (info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]) peer_capability = nla_get_u32(info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]); return rdev_tdls_mgmt(rdev, dev, peer, action_code, dialog_token, status_code, peer_capability, + initiator, nla_data(info->attrs[NL80211_ATTR_IE]), nla_len(info->attrs[NL80211_ATTR_IE])); } @@ -8928,13 +8992,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) return -ERANGE; - memcpy(rekey_data.kek, nla_data(tb[NL80211_REKEY_DATA_KEK]), - NL80211_KEK_LEN); - memcpy(rekey_data.kck, nla_data(tb[NL80211_REKEY_DATA_KCK]), - NL80211_KCK_LEN); - memcpy(rekey_data.replay_ctr, - nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]), - NL80211_REPLAY_CTR_LEN); + rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); + rekey_data.replay_ctr = 
nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); wdev_lock(wdev); if (!wdev->current_bss) { @@ -9300,6 +9360,9 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb) void *hdr = ((void **)skb->cb)[1]; struct nlattr *data = ((void **)skb->cb)[2]; + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + if (WARN_ON(!rdev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; @@ -9363,6 +9426,93 @@ static int nl80211_set_qos_map(struct sk_buff *skb, return ret; } +static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid, up; + u16 admitted_time = 0; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_USER_PRIO]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + if (tsid >= IEEE80211_NUM_TIDS) + return -EINVAL; + + up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); + if (up >= IEEE80211_NUM_UPS) + return -EINVAL; + + /* WMM uses TIDs 0-7 even for TSPEC */ + if (tsid < IEEE80211_FIRST_TSPEC_TSID) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EINVAL; + } else { + /* TODO: handle 802.11 TSPEC/admission control + * need more attributes for that (e.g. 
BA session requirement) + */ + return -EINVAL; + } + + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) { + admitted_time = + nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]); + if (!admitted_time) + return -EINVAL; + } + + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time); + + out: + wdev_unlock(wdev); + return err; +} + +static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid; + int err; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + err = rdev_del_tx_ts(rdev, dev, tsid, peer); + wdev_unlock(wdev); + + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -9373,6 +9523,7 @@ static int nl80211_set_qos_map(struct sk_buff *skb, /* If a netdev is associated, it must be UP, P2P must be started */ #define NL80211_FLAG_NEED_WDEV_UP (NL80211_FLAG_NEED_WDEV |\ NL80211_FLAG_CHECK_NETDEV_UP) +#define NL80211_FLAG_CLEAR_SKB 0x20 static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -9456,8 +9607,20 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, dev_put(info->user_ptr[1]); } } + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) rtnl_unlock(); + + /* If needed, clear the netlink message payload from the SKB + * as it might contain key data that shouldn't 
stick around on + * the heap after the SKB is freed. The netlink message header + * is still needed for further processing, so leave it intact. + */ + if (ops->internal_flags & NL80211_FLAG_CLEAR_SKB) { + struct nlmsghdr *nlh = nlmsg_hdr(skb); + + memset(nlmsg_data(nlh), 0, nlmsg_len(nlh)); + } } static const struct genl_ops nl80211_ops[] = { @@ -9525,7 +9688,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_NEW_KEY, @@ -9533,7 +9697,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_KEY, @@ -9711,7 +9876,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_ASSOCIATE, @@ -9945,7 +10111,8 @@ static const struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_TDLS_MGMT, @@ -10103,6 +10270,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_TX_TS, + .doit = nl80211_add_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_TX_TS, + .doit = nl80211_del_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + 
NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ @@ -10371,7 +10554,8 @@ nla_put_failure: static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, - enum nl80211_commands cmd, gfp_t gfp) + enum nl80211_commands cmd, gfp_t gfp, + int uapsd_queues) { struct sk_buff *msg; void *hdr; @@ -10391,6 +10575,19 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_FRAME, len, buf)) goto nla_put_failure; + if (uapsd_queues >= 0) { + struct nlattr *nla_wmm = + nla_nest_start(msg, NL80211_ATTR_STA_WME); + if (!nla_wmm) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_STA_WME_UAPSD_QUEUES, + uapsd_queues)) + goto nla_put_failure; + + nla_nest_end(msg, nla_wmm); + } + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -10407,15 +10604,15 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_AUTHENTICATE, gfp); + NL80211_CMD_AUTHENTICATE, gfp, -1); } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, - size_t len, gfp_t gfp) + size_t len, gfp_t gfp, int uapsd_queues) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_ASSOCIATE, gfp); + NL80211_CMD_ASSOCIATE, gfp, uapsd_queues); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, @@ -10423,7 +10620,7 @@ void nl80211_send_deauth(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DEAUTHENTICATE, gfp); + NL80211_CMD_DEAUTHENTICATE, gfp, -1); } void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, @@ -10431,7 +10628,7 @@ void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, size_t len, gfp_t gfp) { nl80211_send_mlme_event(rdev, netdev, buf, len, - 
NL80211_CMD_DISASSOCIATE, gfp); + NL80211_CMD_DISASSOCIATE, gfp, -1); } void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, @@ -10452,7 +10649,7 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, cmd = NL80211_CMD_UNPROT_DISASSOCIATE; trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); - nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC); + nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1); } EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 49c9a482dd12..7ad70d6f0cc6 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -23,7 +23,8 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, gfp_t gfp, + int uapsd_queues); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index d95bbe348138..f6d457d6a558 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -714,25 +714,6 @@ static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, return ret; } -static inline int rdev_set_ringparam(struct cfg80211_registered_device *rdev, - u32 tx, u32 rx) -{ - int ret; - trace_rdev_set_ringparam(&rdev->wiphy, tx, rx); - ret = rdev->ops->set_ringparam(&rdev->wiphy, tx, rx); - trace_rdev_return_int(&rdev->wiphy, ret); - return ret; -} - -static inline void rdev_get_ringparam(struct cfg80211_registered_device *rdev, - u32 *tx, u32 *tx_max, u32 *rx, - u32 *rx_max) -{ - trace_rdev_get_ringparam(&rdev->wiphy); - rdev->ops->get_ringparam(&rdev->wiphy, tx, tx_max, rx, rx_max); - trace_rdev_return_void_tx_rx(&rdev->wiphy, *tx, *tx_max, *rx, *rx_max); -} - 
static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, @@ -770,15 +751,15 @@ static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, - const u8 *buf, size_t len) + bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, action_code, dialog_token, status_code, peer_capability, - buf, len); + initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, action_code, dialog_token, status_code, peer_capability, - buf, len); + initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -816,35 +797,6 @@ static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, } static inline int -rdev_get_et_sset_count(struct cfg80211_registered_device *rdev, - struct net_device *dev, int sset) -{ - int ret; - trace_rdev_get_et_sset_count(&rdev->wiphy, dev, sset); - ret = rdev->ops->get_et_sset_count(&rdev->wiphy, dev, sset); - trace_rdev_return_int(&rdev->wiphy, ret); - return ret; -} - -static inline void rdev_get_et_stats(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct ethtool_stats *stats, u64 *data) -{ - trace_rdev_get_et_stats(&rdev->wiphy, dev); - rdev->ops->get_et_stats(&rdev->wiphy, dev, stats, data); - trace_rdev_return_void(&rdev->wiphy); -} - -static inline void rdev_get_et_strings(struct cfg80211_registered_device *rdev, - struct net_device *dev, u32 sset, - u8 *data) -{ - trace_rdev_get_et_strings(&rdev->wiphy, dev, sset); - rdev->ops->get_et_strings(&rdev->wiphy, dev, sset, data); - trace_rdev_return_void(&rdev->wiphy); -} - -static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) @@ -963,4 +915,35 @@ rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, return ret; } +static inline int 
+rdev_add_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer, + u8 user_prio, u16 admitted_time) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + if (rdev->ops->add_tx_ts) + ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_del_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); + if (rdev->ops->del_tx_ts) + ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1afdf45db38f..b725a31a4751 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2008-2011 Luis R. 
Rodriguez <mcgrof@qca.qualcomm.com> + * Copyright 2013-2014 Intel Mobile Communications GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -798,6 +799,57 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, return 0; } +/* check whether old rule contains new rule */ +static bool rule_contains(struct ieee80211_reg_rule *r1, + struct ieee80211_reg_rule *r2) +{ + /* for simplicity, currently consider only same flags */ + if (r1->flags != r2->flags) + return false; + + /* verify r1 is more restrictive */ + if ((r1->power_rule.max_antenna_gain > + r2->power_rule.max_antenna_gain) || + r1->power_rule.max_eirp > r2->power_rule.max_eirp) + return false; + + /* make sure r2's range is contained within r1 */ + if (r1->freq_range.start_freq_khz > r2->freq_range.start_freq_khz || + r1->freq_range.end_freq_khz < r2->freq_range.end_freq_khz) + return false; + + /* and finally verify that r1.max_bw >= r2.max_bw */ + if (r1->freq_range.max_bandwidth_khz < + r2->freq_range.max_bandwidth_khz) + return false; + + return true; +} + +/* add or extend current rules. 
do nothing if rule is already contained */ +static void add_rule(struct ieee80211_reg_rule *rule, + struct ieee80211_reg_rule *reg_rules, u32 *n_rules) +{ + struct ieee80211_reg_rule *tmp_rule; + int i; + + for (i = 0; i < *n_rules; i++) { + tmp_rule = &reg_rules[i]; + /* rule is already contained - do nothing */ + if (rule_contains(tmp_rule, rule)) + return; + + /* extend rule if possible */ + if (rule_contains(rule, tmp_rule)) { + memcpy(tmp_rule, rule, sizeof(*rule)); + return; + } + } + + memcpy(&reg_rules[*n_rules], rule, sizeof(*rule)); + (*n_rules)++; +} + /** * regdom_intersect - do the intersection between two regulatory domains * @rd1: first regulatory domain @@ -817,12 +869,10 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, { int r, size_of_regd; unsigned int x, y; - unsigned int num_rules = 0, rule_idx = 0; + unsigned int num_rules = 0; const struct ieee80211_reg_rule *rule1, *rule2; - struct ieee80211_reg_rule *intersected_rule; + struct ieee80211_reg_rule intersected_rule; struct ieee80211_regdomain *rd; - /* This is just a dummy holder to help us count */ - struct ieee80211_reg_rule dummy_rule; if (!rd1 || !rd2) return NULL; @@ -840,7 +890,7 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; if (!reg_rules_intersect(rd1, rd2, rule1, rule2, - &dummy_rule)) + &intersected_rule)) num_rules++; } } @@ -855,34 +905,24 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, if (!rd) return NULL; - for (x = 0; x < rd1->n_reg_rules && rule_idx < num_rules; x++) { + for (x = 0; x < rd1->n_reg_rules; x++) { rule1 = &rd1->reg_rules[x]; - for (y = 0; y < rd2->n_reg_rules && rule_idx < num_rules; y++) { + for (y = 0; y < rd2->n_reg_rules; y++) { rule2 = &rd2->reg_rules[y]; - /* - * This time around instead of using the stack lets - * write to the target rule directly saving ourselves - * a memcpy() - */ - intersected_rule = &rd->reg_rules[rule_idx]; r = reg_rules_intersect(rd1,
rd2, rule1, rule2, - intersected_rule); + &intersected_rule); /* * No need to memset here the intersected rule here as * we're not using the stack anymore */ if (r) continue; - rule_idx++; - } - } - if (rule_idx != num_rules) { - kfree(rd); - return NULL; + add_rule(&intersected_rule, rd->reg_rules, + &rd->n_reg_rules); + } } - rd->n_reg_rules = num_rules; rd->alpha2[0] = '9'; rd->alpha2[1] = '8'; rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0798c62e6085..bda39f149810 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2,6 +2,7 @@ * cfg80211 scan result handling * * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/kernel.h> #include <linux/slab.h> @@ -884,6 +885,7 @@ struct cfg80211_bss* cfg80211_inform_bss_width(struct wiphy *wiphy, struct ieee80211_channel *rx_channel, enum nl80211_bss_scan_width scan_width, + enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, s32 signal, gfp_t gfp) @@ -911,21 +913,32 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; /* - * Since we do not know here whether the IEs are from a Beacon or Probe + * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should * override the IEs pointer should we have received an earlier * indication of Probe Response data. 
*/ - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = tsf; + ies->from_beacon = false; memcpy(ies->data, ie, ielen); - rcu_assign_pointer(tmp.pub.beacon_ies, ies); + switch (ftype) { + case CFG80211_BSS_FTYPE_BEACON: + ies->from_beacon = true; + /* fall through to assign */ + case CFG80211_BSS_FTYPE_UNKNOWN: + rcu_assign_pointer(tmp.pub.beacon_ies, ies); + break; + case CFG80211_BSS_FTYPE_PRESP: + rcu_assign_pointer(tmp.pub.proberesp_ies, ies); + break; + } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = abs(rx_channel->center_freq - channel->center_freq) <= @@ -982,11 +995,12 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, if (!channel) return NULL; - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control); memcpy(ies->data, mgmt->u.probe_resp.variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 8bbeeb302216..dc1668ff543b 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -641,7 +641,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, } if (status != WLAN_STATUS_SUCCESS) { - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; if (bss) { @@ -918,7 +918,7 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->connect_keys)) { - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; } @@ -978,7 +978,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; if (wdev->conn) diff --git a/net/wireless/trace.h b/net/wireless/trace.h 
index 7cc887f9da11..625a6e6d1168 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -298,11 +298,6 @@ DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_ARGS(wiphy) ); -DEFINE_EVENT(wiphy_only_evt, rdev_get_ringparam, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) -); - DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) @@ -580,11 +575,6 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap, TP_ARGS(wiphy, netdev) ); -DEFINE_EVENT(wiphy_netdev_evt, rdev_get_et_stats, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), - TP_ARGS(wiphy, netdev) -); - DEFINE_EVENT(wiphy_netdev_evt, rdev_sched_scan_stop, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) @@ -1439,11 +1429,6 @@ DECLARE_EVENT_CLASS(tx_rx_evt, WIPHY_PR_ARG, __entry->tx, __entry->rx) ); -DEFINE_EVENT(tx_rx_evt, rdev_set_ringparam, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, rx, tx) -); - DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), TP_ARGS(wiphy, rx, tx) @@ -1469,9 +1454,9 @@ TRACE_EVENT(rdev_tdls_mgmt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *peer, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, - const u8 *buf, size_t len), + bool initiator, const u8 *buf, size_t len), TP_ARGS(wiphy, netdev, peer, action_code, dialog_token, status_code, - peer_capability, buf, len), + peer_capability, initiator, buf, len), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1480,6 +1465,7 @@ TRACE_EVENT(rdev_tdls_mgmt, __field(u8, dialog_token) __field(u16, status_code) __field(u32, peer_capability) + __field(bool, initiator) __dynamic_array(u8, buf, len) ), TP_fast_assign( @@ -1490,13 +1476,16 @@ TRACE_EVENT(rdev_tdls_mgmt, __entry->dialog_token = dialog_token; __entry->status_code = status_code; __entry->peer_capability = peer_capability; + __entry->initiator = initiator; memcpy(__get_dynamic_array(buf), buf, len); ), 
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", action_code: %u, " - "dialog_token: %u, status_code: %u, peer_capability: %u buf: %#.2x ", + "dialog_token: %u, status_code: %u, peer_capability: %u " + "initiator: %s buf: %#.2x ", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->action_code, __entry->dialog_token, __entry->status_code, __entry->peer_capability, + BOOL_TO_STR(__entry->initiator), ((u8 *)__get_dynamic_array(buf))[0]) ); @@ -1725,40 +1714,6 @@ TRACE_EVENT(rdev_set_noack_map, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map) ); -TRACE_EVENT(rdev_get_et_sset_count, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int sset), - TP_ARGS(wiphy, netdev, sset), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __field(int, sset) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - __entry->sset = sset; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", sset: %d", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sset) -); - -TRACE_EVENT(rdev_get_et_strings, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 sset), - TP_ARGS(wiphy, netdev, sset), - TP_STRUCT__entry( - WIPHY_ENTRY - NETDEV_ENTRY - __field(u32, sset) - ), - TP_fast_assign( - WIPHY_ASSIGN; - NETDEV_ASSIGN; - __entry->sset = sset; - ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", sset: %u", - WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sset) -); - DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) @@ -1941,6 +1896,51 @@ TRACE_EVENT(rdev_set_ap_chanwidth, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(rdev_add_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time), + TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + __field(u8, user_prio) + __field(u16, admitted_time) + ), + TP_fast_assign( + WIPHY_ASSIGN; + 
NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + __entry->user_prio = user_prio; + __entry->admitted_time = admitted_time; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->tsid, __entry->user_prio, __entry->admitted_time) +); + +TRACE_EVENT(rdev_del_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer), + TP_ARGS(wiphy, netdev, tsid, peer), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ diff --git a/net/wireless/util.c b/net/wireless/util.c index 728f1c0dc70d..5e233a577d0f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2,6 +2,7 @@ * Wireless utility functions * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/export.h> #include <linux/bitops.h> @@ -796,7 +797,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) netdev_err(dev, "failed to set mgtdef %d\n", i); } - kfree(wdev->connect_keys); + kzfree(wdev->connect_keys); wdev->connect_keys = NULL; } diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 11120bb14162..0f47948c572f 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -496,6 +496,8 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, err = 0; if (!err) { if (!addr) { + memset(wdev->wext.keys->data[idx], 0, + sizeof(wdev->wext.keys->data[idx])); 
wdev->wext.keys->params[idx].key_len = 0; wdev->wext.keys->params[idx].cipher = 0; } diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index c7e5c8eb4f24..368611c05739 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -57,7 +57,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) - kfree(ck); + kzfree(ck); return err; } diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 0622d319e1f2..666c5ffe929d 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -3,6 +3,7 @@ #include <linux/xfrm.h> #include <linux/socket.h> +#include <linux/jhash.h> static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr) { @@ -28,6 +29,58 @@ static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr, saddr->a6[2] ^ saddr->a6[3]); } +static inline u32 __bits2mask32(__u8 bits) +{ + u32 mask32 = 0xffffffff; + + if (bits == 0) + mask32 = 0; + else if (bits < 32) + mask32 <<= (32 - bits); + + return mask32; +} + +static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits), + ntohl(saddr->a4) & __bits2mask32(sbits), + 0); +} + +static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, + __u8 prefixlen) +{ + int pdw; + int pbi; + u32 initval = 0; + + pdw = prefixlen >> 5; /* num of whole u32 in prefix */ + pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ + + if (pbi) { + __be32 mask; + + mask = htonl((0xffffffff) << (32 - pbi)); + + initval = (__force u32)(addr->a6[pdw] & mask); + } + + return jhash2((__force u32 *)addr->a6, pdw, initval); +} + +static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr, + const xfrm_address_t *saddr, + __u8 dbits, + __u8 sbits) +{ + return __xfrm6_pref_hash(daddr, 
dbits) ^ + __xfrm6_pref_hash(saddr, sbits); +} + static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family, @@ -84,7 +137,8 @@ static inline unsigned int __idx_hash(u32 index, unsigned int hmask) } static inline unsigned int __sel_hash(const struct xfrm_selector *sel, - unsigned short family, unsigned int hmask) + unsigned short family, unsigned int hmask, + u8 dbits, u8 sbits) { const xfrm_address_t *daddr = &sel->daddr; const xfrm_address_t *saddr = &sel->saddr; @@ -92,19 +146,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, switch (family) { case AF_INET: - if (sel->prefixlen_d != 32 || - sel->prefixlen_s != 32) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - if (sel->prefixlen_d != 128 || - sel->prefixlen_s != 128) + if (sel->prefixlen_d < dbits || + sel->prefixlen_s < sbits) return hmask + 1; - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); @@ -113,17 +167,19 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel, static inline unsigned int __addr_hash(const xfrm_address_t *daddr, const xfrm_address_t *saddr, - unsigned short family, unsigned int hmask) + unsigned short family, + unsigned int hmask, + u8 dbits, u8 sbits) { unsigned int h = 0; switch (family) { case AF_INET: - h = __xfrm4_daddr_saddr_hash(daddr, saddr); + h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits); break; case AF_INET6: - h = __xfrm6_daddr_saddr_hash(daddr, saddr); + h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits); break; } h ^= (h >> 16); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index c51e8f7b8653..499d6c18a8ce 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -166,11 +166,7 
@@ static int xfrm_output_gso(struct sk_buff *skb) err = xfrm_output2(segs); if (unlikely(err)) { - while ((segs = nskb)) { - nskb = segs->next; - segs->next = NULL; - kfree_skb(segs); - } + kfree_skb_list(nskb); return err; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78ba328..4c4e457e7888 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -39,6 +39,11 @@ #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 +struct xfrm_flo { + struct dst_entry *dst_orig; + u8 flags; +}; + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] __read_mostly; @@ -344,12 +349,39 @@ static inline unsigned int idx_hash(struct net *net, u32 index) return __idx_hash(index, net->xfrm.policy_idx_hmask); } +/* calculate policy hash thresholds */ +static void __get_hash_thresh(struct net *net, + unsigned short family, int dir, + u8 *dbits, u8 *sbits) +{ + switch (family) { + case AF_INET: + *dbits = net->xfrm.policy_bydst[dir].dbits4; + *sbits = net->xfrm.policy_bydst[dir].sbits4; + break; + + case AF_INET6: + *dbits = net->xfrm.policy_bydst[dir].dbits6; + *sbits = net->xfrm.policy_bydst[dir].sbits6; + break; + + default: + *dbits = 0; + *sbits = 0; + } +} + static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __sel_hash(sel, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __sel_hash(sel, family, hmask, dbits, sbits); return (hash == hmask + 1 ? 
&net->xfrm.policy_inexact[dir] : @@ -362,25 +394,35 @@ static struct hlist_head *policy_hash_direct(struct net *net, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; - unsigned int hash = __addr_hash(daddr, saddr, family, hmask); + unsigned int hash; + u8 dbits; + u8 sbits; + + __get_hash_thresh(net, family, dir, &dbits, &sbits); + hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return net->xfrm.policy_bydst[dir].table + hash; } -static void xfrm_dst_hash_transfer(struct hlist_head *list, +static void xfrm_dst_hash_transfer(struct net *net, + struct hlist_head *list, struct hlist_head *ndsttable, - unsigned int nhashmask) + unsigned int nhashmask, + int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; + u8 dbits; + u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; + __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, - pol->family, nhashmask); + pol->family, nhashmask, dbits, sbits); if (!entry0) { hlist_del(&pol->bydst); hlist_add_head(&pol->bydst, ndsttable+h); @@ -389,7 +431,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -434,7 +476,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) write_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) - xfrm_dst_hash_transfer(odst + i, ndst, nhashmask); + xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); net->xfrm.policy_bydst[dir].table = ndst; net->xfrm.policy_bydst[dir].hmask = nhashmask; @@ -529,6 +571,86 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_rebuild(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, + xfrm.policy_hthresh.work); + unsigned int hmask; + struct 
xfrm_policy *pol; + struct xfrm_policy *policy; + struct hlist_head *chain; + struct hlist_head *odst; + struct hlist_node *newpos; + int i; + int dir; + unsigned seq; + u8 lbits4, rbits4, lbits6, rbits6; + + mutex_lock(&hash_resize_mutex); + + /* read selector prefixlen thresholds */ + do { + seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + lbits4 = net->xfrm.policy_hthresh.lbits4; + rbits4 = net->xfrm.policy_hthresh.rbits4; + lbits6 = net->xfrm.policy_hthresh.lbits6; + rbits6 = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); + + write_lock_bh(&net->xfrm.xfrm_policy_lock); + + /* reset the bydst and inexact table in all directions */ + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + hmask = net->xfrm.policy_bydst[dir].hmask; + odst = net->xfrm.policy_bydst[dir].table; + for (i = hmask; i >= 0; i--) + INIT_HLIST_HEAD(odst + i); + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + /* dir out => dst = remote, src = local */ + net->xfrm.policy_bydst[dir].dbits4 = rbits4; + net->xfrm.policy_bydst[dir].sbits4 = lbits4; + net->xfrm.policy_bydst[dir].dbits6 = rbits6; + net->xfrm.policy_bydst[dir].sbits6 = lbits6; + } else { + /* dir in/fwd => dst = local, src = remote */ + net->xfrm.policy_bydst[dir].dbits4 = lbits4; + net->xfrm.policy_bydst[dir].sbits4 = rbits4; + net->xfrm.policy_bydst[dir].dbits6 = lbits6; + net->xfrm.policy_bydst[dir].sbits6 = rbits6; + } + } + + /* re-insert all policies by order of creation */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + newpos = NULL; + chain = policy_hash_bysel(net, &policy->selector, + policy->family, + xfrm_policy_id2dir(policy->index)); + hlist_for_each_entry(pol, chain, bydst) { + if (policy->priority >= pol->priority) + newpos = &pol->bydst; + else + break; + } + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, chain); + } + + 
write_unlock_bh(&net->xfrm.xfrm_policy_lock); + + mutex_unlock(&hash_resize_mutex); +} + +void xfrm_policy_hash_rebuild(struct net *net) +{ + schedule_work(&net->xfrm.policy_hthresh.work); +} +EXPORT_SYMBOL(xfrm_policy_hash_rebuild); + /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) @@ -654,7 +776,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); @@ -1839,10 +1961,8 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - const struct sk_buff *fclone = skb + 1; - if (unlikely(skb->fclone == SKB_FCLONE_ORIG && - fclone->fclone == SKB_FCLONE_CLONE)) { + if (unlikely(skb_fclone_busy(skb))) { kfree_skb(skb); return 0; } @@ -1877,13 +1997,14 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, - struct dst_entry *dst, + struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; + struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; @@ -1891,9 +2012,12 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, if (IS_ERR(xdst)) return xdst; - if (net->xfrm.sysctl_larval_drop || num_xfrms <= 0) + if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || + net->xfrm.sysctl_larval_drop || + num_xfrms <= 0) return xdst; + dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; @@ -1935,7 +2059,7 @@ static struct flow_cache_object * xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, 
struct flow_cache_object *oldflo, void *ctx) { - struct dst_entry *dst_orig = (struct dst_entry *)ctx; + struct xfrm_flo *xflo = (struct xfrm_flo *)ctx; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst, *new_xdst; int num_pols = 0, num_xfrms = 0, i, err, pol_dead; @@ -1976,7 +2100,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, goto make_dummy_bundle; } - new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); + new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, + xflo->dst_orig); if (IS_ERR(new_xdst)) { err = PTR_ERR(new_xdst); if (err != -EAGAIN) @@ -2010,7 +2135,7 @@ make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ - xdst = xfrm_create_dummy_bundle(net, dst_orig, fl, num_xfrms, family); + xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); @@ -2104,13 +2229,18 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, } if (xdst == NULL) { + struct xfrm_flo xflo; + + xflo.dst_orig = dst_orig; + xflo.flags = flags; + /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; flo = flow_cache_lookup(net, fl, family, dir, - xfrm_bundle_lookup, dst_orig); + xfrm_bundle_lookup, &xflo); if (flo == NULL) goto nopol; if (IS_ERR(flo)) { @@ -2138,7 +2268,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, xfrm_pols_put(pols, drop_pols); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - return make_blackhole(net, family, dst_orig); + return ERR_PTR(-EREMOTE); } err = -EAGAIN; @@ -2195,6 +2325,23 @@ dropdst: } EXPORT_SYMBOL(xfrm_lookup); +/* Callers of xfrm_lookup_route() must ensure a call to dst_output(). 
+ * Otherwise we may send out blackholed packets. + */ +struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, + struct sock *sk, int flags) +{ + struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, + flags | XFRM_LOOKUP_QUEUE); + + if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) + return make_blackhole(net, dst_orig->ops->family, dst_orig); + + return dst; +} +EXPORT_SYMBOL(xfrm_lookup_route); + static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { @@ -2460,7 +2607,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) skb_dst_force(skb); - dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0); + dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; @@ -2830,10 +2977,21 @@ static int __net_init xfrm_policy_init(struct net *net) if (!htab->table) goto out_bydst; htab->hmask = hmask; + htab->dbits4 = 32; + htab->sbits4 = 32; + htab->dbits6 = 128; + htab->sbits6 = 128; } + net->xfrm.policy_hthresh.lbits4 = 32; + net->xfrm.policy_hthresh.rbits4 = 32; + net->xfrm.policy_hthresh.lbits6 = 128; + net->xfrm.policy_hthresh.rbits6 = 128; + + seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) register_netdevice_notifier(&xfrm_dev_notifier); return 0; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0ab54134bb40..de971b6d38c5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -97,8 +97,6 @@ static unsigned long xfrm_hash_new_size(unsigned int state_hmask) return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); } -static DEFINE_MUTEX(hash_resize_mutex); - static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); @@ -107,22 +105,20 
@@ static void xfrm_hash_resize(struct work_struct *work) unsigned int nhashmask, ohashmask; int i; - mutex_lock(&hash_resize_mutex); - nsize = xfrm_hash_new_size(net->xfrm.state_hmask); ndst = xfrm_hash_alloc(nsize); if (!ndst) - goto out_unlock; + return; nsrc = xfrm_hash_alloc(nsize); if (!nsrc) { xfrm_hash_free(ndst, nsize); - goto out_unlock; + return; } nspi = xfrm_hash_alloc(nsize); if (!nspi) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); - goto out_unlock; + return; } spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -148,9 +144,6 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); - -out_unlock: - mutex_unlock(&hash_resize_mutex); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d4db6ebb089d..e812e988c111 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -333,8 +333,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, algo = xfrm_aalg_get_byname(ualg->alg_name, 1); if (!algo) return -ENOSYS; - if ((ualg->alg_trunc_len / 8) > MAX_AH_AUTH_LEN || - ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) + if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) return -EINVAL; *props = algo->desc.sadb_alg_id; @@ -964,7 +963,9 @@ static inline size_t xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) - + nla_total_size(sizeof(struct xfrmu_spdhinfo)); + + nla_total_size(sizeof(struct xfrmu_spdhinfo)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)); } static int build_spdinfo(struct sk_buff *skb, struct net *net, @@ -973,9 +974,11 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, struct xfrmk_spdinfo si; struct xfrmu_spdinfo spc; struct xfrmu_spdhinfo sph; + struct xfrmu_spdhthresh spt4, spt6; struct nlmsghdr *nlh; int err; u32 *f; + unsigned lseq; 
nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ @@ -993,9 +996,22 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, sph.spdhcnt = si.spdhcnt; sph.spdhmcnt = si.spdhmcnt; + do { + lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + spt4.lbits = net->xfrm.policy_hthresh.lbits4; + spt4.rbits = net->xfrm.policy_hthresh.rbits4; + spt6.lbits = net->xfrm.policy_hthresh.lbits6; + spt6.rbits = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq)); + err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); if (!err) err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1004,6 +1020,51 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, return nlmsg_end(skb, nlh); } +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrmu_spdhthresh *thresh4 = NULL; + struct xfrmu_spdhthresh *thresh6 = NULL; + + /* selector prefixlen thresholds to hash policies */ + if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh4)) + return -EINVAL; + thresh4 = nla_data(rta); + if (thresh4->lbits > 32 || thresh4->rbits > 32) + return -EINVAL; + } + if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh6)) + return -EINVAL; + thresh6 = nla_data(rta); + if (thresh6->lbits > 128 || thresh6->rbits > 128) + return -EINVAL; + } + + if (thresh4 || thresh6) { + write_seqlock(&net->xfrm.policy_hthresh.lock); + if (thresh4) { + net->xfrm.policy_hthresh.lbits4 = thresh4->lbits; + net->xfrm.policy_hthresh.rbits4 = 
thresh4->rbits; + } + if (thresh6) { + net->xfrm.policy_hthresh.lbits6 = thresh6->lbits; + net->xfrm.policy_hthresh.rbits6 = thresh6->rbits; + } + write_sequnlock(&net->xfrm.policy_hthresh.lock); + + xfrm_policy_hash_rebuild(net); + } + + return 0; +} + static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2274,6 +2335,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), }; @@ -2308,10 +2370,17 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, }; +static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { + [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, + [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, +}; + static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); + const struct nla_policy *nla_pol; + int nla_max; } xfrm_dispatch[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, @@ -2335,6 +2404,9 @@ static const struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo, + .nla_pol = xfrma_spd_policy, + .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; @@ -2371,8 +2443,9 @@ static int 
xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } } - err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, + link->nla_max ? : XFRMA_MAX, + link->nla_pol ? : xfrma_policy); if (err < 0) return err; |